diff --git a/CHANGELOG.md b/CHANGELOG.md index 17aaa38..d7437dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ * Column zipping functionality * * Capturing Column name * * Regular expression column names * +* VCF file/table metadata storage * ## 11.2.1 / 2024-11-18 ### Fixed diff --git a/docs/vcf-file-metadata.md b/docs/vcf-file-metadata.md new file mode 100644 index 0000000..120bc61 --- /dev/null +++ b/docs/vcf-file-metadata.md @@ -0,0 +1,46 @@ +--- +layout: page +title: VCF File Metadata +permalink: /vcf-file-metadata/ +--- + +### Introduction +VCF files contain a header storing metadata; `NdrImport::Vcf::Table` now supports retrieval and storage of that data. + +### `vcf_file_metadata` +* `NdrImport::Vcf::Table` can optionally store `vcf_file_metadata`. This is a hash of { attribute name => regular expression }. +* The `NdrImport::File::Vcf` handler uses `vcf_file_metadata` to locate the metadata from within the file, then sets the `file_metadata` attribute as a hash of { attribute name => first captured group of the regular expression }. +* The `UniversalImporterHelper` then assigns the handler's `file_metadata` to the `NdrImport::Table` attribute `table_metadata`, which can then be accessed downstream. + + +### Example: +Given the example data below: + +``` + ##contig= + ##contig= + ##contig= + ##contig= + ##contig= + ##fileDate=2023-03-29 + ##reference=file:///data/humanGenome/hs37d5.fa + ##source=Platypus_Version_0.8.1 + #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample1 +1 26387783 . G A 847.77 PASS AC=1;AF=0.500;AN=2;DP=85;set=Intersection GT:AD:DP:GQ:PL:SAC 0/1:52,32:84:99:876,0,1277:21,31,14,18 +``` + +The `NdrImport::Vcf::Table` mapping might look like: + +``` +- !ruby/object:NdrImport::Vcf::Table + filename_pattern: !ruby/regexp // + vcf_file_metadata: + genome_build: !ruby/regexp /##reference=file:\/\/\/data\/humanGenome\/(.+)/ + columns: + ... 
+``` + +This would result in a `table_metadata` value of: +``` +{ 'genome_build' => 'hs37d5.fa' } +``` diff --git a/docs/xml-file-metadata.md b/docs/xml-file-metadata.md index 27e66e3..0207f46 100644 --- a/docs/xml-file-metadata.md +++ b/docs/xml-file-metadata.md @@ -1,6 +1,6 @@ --- layout: page -title: XML File Netdata +title: XML File Metadata permalink: /xml-file-metadata/ --- diff --git a/docs/yaml-mapping-user-guide.md b/docs/yaml-mapping-user-guide.md index 86b76ae..22539e5 100644 --- a/docs/yaml-mapping-user-guide.md +++ b/docs/yaml-mapping-user-guide.md @@ -15,4 +15,5 @@ add_to_nav: true 8. [XML mappings](xml-mappings.md) 9. [Zipped Field Mapping](zipped-field-mapping.md) 10. [Regular Expression Column Names](regexp-column-names.md) -11. [Capturing Column Names in Mapped Data](capturing-column-names.md) \ No newline at end of file +11. [Capturing Column Names in Mapped Data](capturing-column-names.md) +12. [VCF file metadata](vcf-file-metadata.md) \ No newline at end of file diff --git a/lib/ndr_import/file/vcf.rb b/lib/ndr_import/file/vcf.rb index 1f9c94d..1707c40 100644 --- a/lib/ndr_import/file/vcf.rb +++ b/lib/ndr_import/file/vcf.rb @@ -8,13 +8,42 @@ module NdrImport module File # This class is a vcf file handler that returns a single table. class Vcf < Base + attr_accessor :vcf_file_metadata + + def initialize(*) + super + + @vcf_file_metadata = @options['vcf_file_metadata'] + assign_file_metadata + end + private + # Sets file_metadata from the "##" header lines: for each vcf_file_metadata + # pattern that matches a line, stores its first capture group (nil when blank). + # Does nothing unless vcf_file_metadata is a Hash. + def assign_file_metadata + return unless vcf_file_metadata.is_a?(Hash) + + file_metadata_hash = {} + + ::File.read(@filename).each_line do |line| + next unless line.match?(/^##/) + + vcf_file_metadata.each do |attribute, pattern| + match = line.match(pattern) 
+ file_metadata_hash[attribute] = match[1].presence if match + end + end + + self.file_metadata = file_metadata_hash + end + def rows(&block) return enum_for(:rows) unless block ::File.read(@filename).each_line do |line| - next if line =~ /^##/ + next if line.match?(/^##/) yield BioVcf::VcfLine.parse(line) end diff --git a/lib/ndr_import/universal_importer_helper.rb b/lib/ndr_import/universal_importer_helper.rb index 7d4ad77..f24a225 100644 --- a/lib/ndr_import/universal_importer_helper.rb +++ b/lib/ndr_import/universal_importer_helper.rb @@ -51,22 +51,27 @@ def extract(source_file, &block) NdrImport::File::Registry.files(source_file, 'unzip_path' => unzip_path).each do |filename| # now at the individual file level, can we find the table mapping? table_mapping = get_table_mapping(filename, nil) - - options = { 'unzip_path' => unzip_path, - 'col_sep' => table_mapping.try(:delimiter), - 'file_password' => table_mapping.try(:file_password), - 'liberal_parsing' => table_mapping.try(:liberal_parsing), - 'xml_record_xpath' => table_mapping.try(:xml_record_xpath), - 'slurp' => table_mapping.try(:slurp), - 'yield_xml_record' => table_mapping.try(:yield_xml_record), - 'pattern_match_record_xpath' => table_mapping.try(:pattern_match_record_xpath), - 'xml_file_metadata' => table_mapping.try(:xml_file_metadata) } + options = table_options_from(table_mapping).merge('unzip_path' => unzip_path) tables = NdrImport::File::Registry.tables(filename, table_mapping.try(:format), options) yield_tables_and_their_content(filename, tables, &block) end end + # Builds the options hash that extract passes to NdrImport::File::Registry.tables. + # Every value is read with `try`, so a nil table_mapping yields nil for each option. + def table_options_from(table_mapping) + { 'col_sep' => table_mapping.try(:delimiter), + 'file_password' => table_mapping.try(:file_password), + 'liberal_parsing' => table_mapping.try(:liberal_parsing), + 'xml_record_xpath' => table_mapping.try(:xml_record_xpath), + 'slurp' => table_mapping.try(:slurp), + 'yield_xml_record' => table_mapping.try(:yield_xml_record), + 'pattern_match_record_xpath' => table_mapping.try(:pattern_match_record_xpath), + 'xml_file_metadata' => 
table_mapping.try(:xml_file_metadata), + 'vcf_file_metadata' => table_mapping.try(:vcf_file_metadata) } + end + # This method does the table row yielding for the extract method, setting the notifier # so that we can monitor progress def yield_tables_and_their_content(filename, tables, &block) diff --git a/lib/ndr_import/vcf/table.rb b/lib/ndr_import/vcf/table.rb index 1086d06..4359bf7 100644 --- a/lib/ndr_import/vcf/table.rb +++ b/lib/ndr_import/vcf/table.rb @@ -6,7 +6,7 @@ module Vcf # All other Table logic is inherited from `NdrImport::Table` class Table < ::NdrImport::Table def self.all_valid_options - super - %w[delimiter header_lines footer_lines] + super - %w[delimiter header_lines footer_lines] + %w[vcf_file_metadata] end def header_lines diff --git a/test/file/vcf_test.rb b/test/file/vcf_test.rb index 565941d..2d88c1c 100644 --- a/test/file/vcf_test.rb +++ b/test/file/vcf_test.rb @@ -17,6 +17,22 @@ def setup assert(rows.all? { |row| row.is_a? Array }) assert_equal 7, rows.to_a.length end + + test 'should read vcf file metadata' do + vcf_file_mapping_metadata = { + 'genome_build' => %r{##reference=file.*?/humanGenome/(.+)}, + 'platypus_version' => /##source=Platypus_Version_([\d.]+)/ + } + options = { 'vcf_file_metadata' => vcf_file_mapping_metadata } + handler = NdrImport::File::Vcf.new(@file_path, nil, options) + + assert_equal vcf_file_mapping_metadata, handler.vcf_file_metadata + + expected_metadata = { 'genome_build' => 'hs37d5.fa', 'platypus_version' => '0.8.1' } + assert_equal expected_metadata, handler.file_metadata + tables = handler.send(:tables).to_a + assert_equal expected_metadata, tables.first.last + end end end end diff --git a/test/vcf/table_test.rb b/test/vcf/table_test.rb index 6b0aac8..5ba78a7 100644 --- a/test/vcf/table_test.rb +++ b/test/vcf/table_test.rb @@ -13,7 +13,7 @@ def setup test 'test_all_valid_options' do valid_options = %w[canonical_name columns file_password filename_pattern format klass last_data_column liberal_parsing 
row_identifier - significant_mapped_fields slurp tablename_pattern] + significant_mapped_fields slurp tablename_pattern vcf_file_metadata] assert_equal valid_options.sort, NdrImport::Vcf::Table.all_valid_options.sort end