From b23f359b0e9dda22d1da833bf604c781b05775a1 Mon Sep 17 00:00:00 2001
From: charlie <charlie@stacksubuntu.lan>
Date: Fri, 23 Feb 2024 04:18:24 +0000
Subject: [PATCH 1/2] add KATHAROseq samples to SIFs.

---
 sequence_processing_pipeline/Pipeline.py       | 11 ++++++++++-
 sequence_processing_pipeline/tests/test_Job.py | 11 +++++++----
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py
index c8881f2d..38af4073 100644
--- a/sequence_processing_pipeline/Pipeline.py
+++ b/sequence_processing_pipeline/Pipeline.py
@@ -573,7 +573,9 @@ def generate_sample_info_files(self, addl_info=None):
             df = pd.concat([df, addl_info],
                            ignore_index=True).drop_duplicates()
 
-        df = df[df["sample_name"].str.startswith("BLANK") == True]  # noqa
+        blanks = df[df["sample_name"].str.startswith("BLANK")]
+        katharo = df[df["sample_name"].str.startswith("KATHARO")]
+        df = pd.concat([blanks, katharo])
         samples = list(df.to_records(index=False))
         projects = df.project_name.unique()
 
@@ -609,6 +611,13 @@ def generate_sample_info_files(self, addl_info=None):
                     row['description'] = sample.replace('_', '.')
                     row['collection_timestamp'] = self.get_date_from_run_id()
 
+                    # Although KATHARO samples may contain valuable metadata
+                    # in their columns, none of them appear to match the
+                    # expected values for SIF columns. Hence, the new columns
+                    # for KATHARO samples will be populated with the same
+                    # defaults as BLANKs.
+                    # (possible exception would be well_description and
+                    #  experiment_design_description -> description)
                     row = [row[x] for x in Pipeline.sif_header]
                     f.write('\t'.join(row) + '\n')
 
diff --git a/sequence_processing_pipeline/tests/test_Job.py b/sequence_processing_pipeline/tests/test_Job.py
index b01e651c..ca84e7d8 100644
--- a/sequence_processing_pipeline/tests/test_Job.py
+++ b/sequence_processing_pipeline/tests/test_Job.py
@@ -45,10 +45,13 @@ def my_callback(jid=None, status=None):
 
         obs = job._system_call('ls ' + join(package_root, 'tests', 'bin'),
                                callback=my_callback)
-        exp = {'stdout': 'bcl-convert\nbcl2fastq\nfastqc\n',
-               'stderr': '',
-               'return_code': 0}
-        self.assertDictEqual(obs, exp)
+
+        exp = ['bcl-convert\nbcl2fastq\nfastqc\n',
+               'bcl2fastq\nbcl-convert\nfastqc\n']
+
+        self.assertIn(obs['stdout'], exp)
+        self.assertEqual(obs['stderr'], '')
+        self.assertEqual(obs['return_code'], 0)
 
         for item in callback_results:
             self.assertTrue(isinstance(item[0], int))

From c5a221d1a5adc526478dfaee2a5c5d9c01ff4504 Mon Sep 17 00:00:00 2001
From: charlie <charlie@stacksubuntu.lan>
Date: Fri, 23 Feb 2024 04:45:07 +0000
Subject: [PATCH 2/2] changed SIF file-naming convention

from '..._blanks.tsv' to '..._wetlab.tsv', per request.
---
 sequence_processing_pipeline/Pipeline.py      |  2 +-
 .../tests/test_Pipeline.py                    | 26 +++++++++----------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py
index 38af4073..e56dd8c2 100644
--- a/sequence_processing_pipeline/Pipeline.py
+++ b/sequence_processing_pipeline/Pipeline.py
@@ -583,7 +583,7 @@ def generate_sample_info_files(self, addl_info=None):
         for project in projects:
             samples_in_proj = [x for x, y in samples if y == project]
             some_path = join(self.output_path,
-                             f'{self.run_id}_{project}_blanks.tsv')
+                             f'{self.run_id}_{project}_wetlab.tsv')
             paths.append(some_path)
             with open(some_path, 'w') as f:
                 # write out header to disk
diff --git a/sequence_processing_pipeline/tests/test_Pipeline.py b/sequence_processing_pipeline/tests/test_Pipeline.py
index 5887dfe4..2a58c6ed 100644
--- a/sequence_processing_pipeline/tests/test_Pipeline.py
+++ b/sequence_processing_pipeline/tests/test_Pipeline.py
@@ -365,11 +365,11 @@ def test_generate_sample_information_files(self):
         # filenames.
         obs = [x.split('sequence_processing_pipeline/')[1] for x in paths]
         exp = [(f'tests/data/output_dir/{self.good_run_id}'
-                '_NYU_BMS_Melanoma_13059_blanks.tsv'),
+                '_NYU_BMS_Melanoma_13059_wetlab.tsv'),
                (f'tests/data/output_dir/{self.good_run_id}'
-                '_Feist_11661_blanks.tsv'),
+                '_Feist_11661_wetlab.tsv'),
                (f'tests/data/output_dir/{self.good_run_id}'
-                '_Gerwick_6123_blanks.tsv')]
+                '_Gerwick_6123_wetlab.tsv')]
 
         # sort the lists to ensure both are in a fixed order.
         obs.sort()
@@ -380,13 +380,13 @@ def test_generate_sample_information_files(self):
         # confirm files contain the expected number of lines.
         # This is going to be based on the number of samples named 'BLANK*'
         # in good-sample-sheet.csv.
-        exp_lines = {f'{self.good_run_id}_NYU_BMS_Melanoma_13059_blanks.tsv':
+        exp_lines = {f'{self.good_run_id}_NYU_BMS_Melanoma_13059_wetlab.tsv':
                      33,
-                     f'{self.good_run_id}_Feist_11661_blanks.tsv': 8,
-                     f'{self.good_run_id}_Gerwick_6123_blanks.tsv': 2}
+                     f'{self.good_run_id}_Feist_11661_wetlab.tsv': 8,
+                     f'{self.good_run_id}_Gerwick_6123_wetlab.tsv': 2}
 
         exp_first_lines = {
-            f'{self.good_run_id}_NYU_BMS_Melanoma_13059_blanks.tsv':
+            f'{self.good_run_id}_NYU_BMS_Melanoma_13059_wetlab.tsv':
             'BLANK1.1A\t2021-10-21\t193\t'
             'Control\tNegative\tSterile w'
             'ater blank\turban biome\tres'
@@ -397,7 +397,7 @@ def test_generate_sample_information_files(self):
             'genome\t256318\tBLANK1.1A\tN'
             'YU_BMS_Melanoma\tTRUE\t'
             'UCSD\tFALSE',
-            f'{self.good_run_id}_Feist_11661_blanks.tsv':
+            f'{self.good_run_id}_Feist_11661_wetlab.tsv':
             'BLANK.40.12G\t2021-10-21\t193\tControl'
             '\tNegative\tSterile water blank\turban '
             'biome\tresearch facility\tsterile water'
@@ -405,7 +405,7 @@ def test_generate_sample_information_files(self):
             'LANK.40.12G\t32.5\t-117.25\tcontrol bla'
             'nk\tmetagenome\t256318\tBLANK.40.12G\t'
             'Feist\tTRUE\tUCSD\tFALSE',
-            f'{self.good_run_id}_Gerwick_6123_blanks.tsv':
+            f'{self.good_run_id}_Gerwick_6123_wetlab.tsv':
             'BLANK.41.12G\t2021-10-21\t193\tControl'
             '\tNegative\tSterile water blank\turban'
             ' biome\tresearch facility\tsterile wat'
@@ -416,7 +416,7 @@ def test_generate_sample_information_files(self):
         }
 
         exp_last_lines = {
-            f'{self.good_run_id}_NYU_BMS_Melanoma_13059_blanks.tsv':
+            f'{self.good_run_id}_NYU_BMS_Melanoma_13059_wetlab.tsv':
             'BLANK4.4H\t2021-10-21\t193\t'
             'Control\tNegative\tSterile w'
             'ater blank\turban biome\tres'
@@ -427,7 +427,7 @@ def test_generate_sample_information_files(self):
             'genome\t256318\tBLANK4.4H\tN'
             'YU_BMS_Melanoma\tTRUE\t'
             'UCSD\tFALSE',
-            f'{self.good_run_id}_Feist_11661_blanks.tsv':
+            f'{self.good_run_id}_Feist_11661_wetlab.tsv':
             'BLANK.43.12H\t2021-10-21\t193\tControl'
             '\tNegative\tSterile water blank\turban'
             ' biome\tresearch facility\tsterile wat'
@@ -435,7 +435,7 @@ def test_generate_sample_information_files(self):
             '\tBLANK.43.12H\t32.5\t-117.25\tcontrol'
             ' blank\tmetagenome\t256318\tBLANK.43.1'
             '2H\tFeist\tTRUE\tUCSD\tFALSE',
-            f'{self.good_run_id}_Gerwick_6123_blanks.tsv':
+            f'{self.good_run_id}_Gerwick_6123_wetlab.tsv':
             'BLANK.41.12G\t2021-10-21\t193\tContro'
             'l\tNegative\tSterile water blank\turb'
             'an biome\tresearch facility\tsterile '
@@ -1822,7 +1822,7 @@ def test_generate_sample_information_files(self):
         obs = [x.split('sequence_processing_pipeline/')[1] for x in paths]
 
         exp = [(f'tests/data/output_dir/{self.good_run_id}'
-                '_ABTX_20230208_ABTX_11052_blanks.tsv')]
+                '_ABTX_20230208_ABTX_11052_wetlab.tsv')]
 
         # sort the lists to ensure both are in a fixed order.
         obs.sort()