biocore · charles-cowart · Feb 23, 2024 · Feb 23, 2024 · Feb 23, 2024
diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py
@@ -573,15 +573,17 @@ def generate_sample_info_files(self, addl_info=None):
             df = pd.concat([df, addl_info],
                            ignore_index=True).drop_duplicates()
 
-        df = df[df["sample_name"].str.startswith("BLANK") == True]  # noqa
+        blanks = df[df["sample_name"].str.startswith("BLANK")]
+        katharo = df[df["sample_name"].str.startswith("KATHARO")]
+        df = pd.concat([blanks, katharo])
         samples = list(df.to_records(index=False))
         projects = df.project_name.unique()
 
         paths = []
         for project in projects:
             samples_in_proj = [x for x, y in samples if y == project]
             some_path = join(self.output_path,
-                             f'{self.run_id}_{project}_blanks.tsv')
+                             f'{self.run_id}_{project}_wetlab.tsv')
             paths.append(some_path)
             with open(some_path, 'w') as f:
                 # write out header to disk
@@ -609,6 +611,13 @@ def generate_sample_info_files(self, addl_info=None):
                     row['description'] = sample.replace('_', '.')
                     row['collection_timestamp'] = self.get_date_from_run_id()
 
+                    # KATHARO samples may carry valuable metadata in their
+                    # columns, but none of it appears to match the values
+                    # expected for SIF columns. Hence, the SIF columns
+                    # written for KATHARO samples are populated with the
+                    # same defaults as BLANKs.
+                    # (Possible exceptions: well_description and
+                    #  experiment_design_description -> description.)
                     row = [row[x] for x in Pipeline.sif_header]
                     f.write('\t'.join(row) + '\n')
 

diff --git a/sequence_processing_pipeline/tests/test_Pipeline.py b/sequence_processing_pipeline/tests/test_Pipeline.py
@@ -365,11 +365,11 @@ def test_generate_sample_information_files(self):
         # filenames.
         obs = [x.split('sequence_processing_pipeline/')[1] for x in paths]
         exp = [(f'tests/data/output_dir/{self.good_run_id}'
-                '_NYU_BMS_Melanoma_13059_blanks.tsv'),
+                '_NYU_BMS_Melanoma_13059_wetlab.tsv'),
                (f'tests/data/output_dir/{self.good_run_id}'
-                '_Feist_11661_blanks.tsv'),
+                '_Feist_11661_wetlab.tsv'),
                (f'tests/data/output_dir/{self.good_run_id}'
-                '_Gerwick_6123_blanks.tsv')]
+                '_Gerwick_6123_wetlab.tsv')]
 
         # sort the lists to ensure both are in a fixed order.
         obs.sort()
@@ -380,13 +380,13 @@ def test_generate_sample_information_files(self):
         # confirm files contain the expected number of lines.
         # This is going to be based on the number of samples named 'BLANK*'
         # in good-sample-sheet.csv.
-        exp_lines = {f'{self.good_run_id}_NYU_BMS_Melanoma_13059_blanks.tsv':
+        exp_lines = {f'{self.good_run_id}_NYU_BMS_Melanoma_13059_wetlab.tsv':
                      33,
-                     f'{self.good_run_id}_Feist_11661_blanks.tsv': 8,
-                     f'{self.good_run_id}_Gerwick_6123_blanks.tsv': 2}
+                     f'{self.good_run_id}_Feist_11661_wetlab.tsv': 8,
+                     f'{self.good_run_id}_Gerwick_6123_wetlab.tsv': 2}
 
         exp_first_lines = {
-            f'{self.good_run_id}_NYU_BMS_Melanoma_13059_blanks.tsv':
+            f'{self.good_run_id}_NYU_BMS_Melanoma_13059_wetlab.tsv':
             'BLANK1.1A\t2021-10-21\t193\t'
             'Control\tNegative\tSterile w'
             'ater blank\turban biome\tres'
@@ -397,15 +397,15 @@ def test_generate_sample_information_files(self):
             'genome\t256318\tBLANK1.1A\tN'
             'YU_BMS_Melanoma\tTRUE\t'
             'UCSD\tFALSE',
-            f'{self.good_run_id}_Feist_11661_blanks.tsv':
+            f'{self.good_run_id}_Feist_11661_wetlab.tsv':
             'BLANK.40.12G\t2021-10-21\t193\tControl'
             '\tNegative\tSterile water blank\turban '
             'biome\tresearch facility\tsterile water'
             '\tmisc environment\tUSA:CA:San Diego\tB'
             'LANK.40.12G\t32.5\t-117.25\tcontrol bla'
             'nk\tmetagenome\t256318\tBLANK.40.12G\t'
             'Feist\tTRUE\tUCSD\tFALSE',
-            f'{self.good_run_id}_Gerwick_6123_blanks.tsv':
+            f'{self.good_run_id}_Gerwick_6123_wetlab.tsv':
             'BLANK.41.12G\t2021-10-21\t193\tControl'
             '\tNegative\tSterile water blank\turban'
             ' biome\tresearch facility\tsterile wat'
@@ -416,7 +416,7 @@ def test_generate_sample_information_files(self):
         }
 
         exp_last_lines = {
-            f'{self.good_run_id}_NYU_BMS_Melanoma_13059_blanks.tsv':
+            f'{self.good_run_id}_NYU_BMS_Melanoma_13059_wetlab.tsv':
             'BLANK4.4H\t2021-10-21\t193\t'
             'Control\tNegative\tSterile w'
             'ater blank\turban biome\tres'
@@ -427,15 +427,15 @@ def test_generate_sample_information_files(self):
             'genome\t256318\tBLANK4.4H\tN'
             'YU_BMS_Melanoma\tTRUE\t'
             'UCSD\tFALSE',
-            f'{self.good_run_id}_Feist_11661_blanks.tsv':
+            f'{self.good_run_id}_Feist_11661_wetlab.tsv':
             'BLANK.43.12H\t2021-10-21\t193\tControl'
             '\tNegative\tSterile water blank\turban'
             ' biome\tresearch facility\tsterile wat'
             'er\tmisc environment\tUSA:CA:San Diego'
             '\tBLANK.43.12H\t32.5\t-117.25\tcontrol'
             ' blank\tmetagenome\t256318\tBLANK.43.1'
             '2H\tFeist\tTRUE\tUCSD\tFALSE',
-            f'{self.good_run_id}_Gerwick_6123_blanks.tsv':
+            f'{self.good_run_id}_Gerwick_6123_wetlab.tsv':
             'BLANK.41.12G\t2021-10-21\t193\tContro'
             'l\tNegative\tSterile water blank\turb'
             'an biome\tresearch facility\tsterile '
@@ -1822,7 +1822,7 @@ def test_generate_sample_information_files(self):
         obs = [x.split('sequence_processing_pipeline/')[1] for x in paths]
 
         exp = [(f'tests/data/output_dir/{self.good_run_id}'
-                '_ABTX_20230208_ABTX_11052_blanks.tsv')]
+                '_ABTX_20230208_ABTX_11052_wetlab.tsv')]
 
         # sort the lists to ensure both are in a fixed order.
         obs.sort()