update version to 0.3.0, doc improvements, fixes issue jamescasbon#16

Neomancy · Feb 8, 2012 · 3811a81 · 3811a81
1 parent 29b3373
commit 3811a81
Show file tree

Hide file tree

Showing 8 changed files with 104 additions and 32 deletions.
diff --git a/README.rst b/README.rst
@@ -8,7 +8,7 @@ specified in the meta-information lines --  specifically the ##INFO and
 against the reserved types mentioned in the spec.  Failing that, it will just
 return strings.
 
-There is currently one piece of interface: ``Reader``.  It takes a file-like
+The main interface is the class ``Reader``.  It takes a file-like
 object and acts as a reader::
 
     >>> import vcf
@@ -18,13 +18,12 @@ object and acts as a reader::
     Record(CHROM=20, POS=14370, REF=G, ALT=['A'])
     Record(CHROM=20, POS=17330, REF=T, ALT=['A'])
     Record(CHROM=20, POS=1110696, REF=A, ALT=['G', 'T'])
-    Record(CHROM=20, POS=1230237, REF=T, ALT=['.'])
+    Record(CHROM=20, POS=1230237, REF=T, ALT=[None])
     Record(CHROM=20, POS=1234567, REF=GTCT, ALT=['G', 'GTACT'])
 
 
 This produces a great deal of information, but it is conveniently accessed.
-The attributes of a Record are the 8 fixed fields from the VCF spec plus two
-more.  That is:
+The attributes of a Record are the 8 fixed fields from the VCF spec::
 
     * ``Record.CHROM``
     * ``Record.POS``
@@ -35,13 +34,13 @@ more.  That is:
     * ``Record.FILTER``
     * ``Record.INFO``
 
-plus three more attributes to handle genotype information:
+plus attributes to handle genotype information:
 
     * ``Record.FORMAT``
     * ``Record.samples``
     * ``Record.genotype``
 
-``samples`` and ``genotypes``, not being the title of any column, is left lowercase.  The format
+``samples`` and ``genotype``, not being the title of any column, are left lowercase.  The format
 of the fixed fields is from the spec.  Comma-separated lists in the VCF are
 converted to lists.  In particular, one-entry VCF lists are converted to
 one-entry Python lists (see, e.g., ``Record.ALT``).  Semicolon-delimited lists
@@ -57,7 +56,7 @@ a ``True`` value. Integers and floats are handled exactly as you'd expect::
     >>> print record.INFO['AF']
     [0.5]
 
-There are a number of convienience functions for each ``Record`` allowing you to
+There are a number of convenience methods and properties for each ``Record`` allowing you to
 examine properties of interest::
 
     >>> print record.num_called, record.call_rate, record.num_unknown
@@ -67,7 +66,8 @@ examine properties of interest::
     >>> print record.nucl_diversity, record.aaf
     0.6 0.5
     >>> print record.get_hets()
-    [Call(sample=NA00002, GT=1|0)]
+    [Call(sample=NA00002, GT=1|0, GQ=[48])]
+
 
 ``record.FORMAT`` will be a string specifying the format of the genotype
 fields.  In case the FORMAT column does not exist, ``record.FORMAT`` is
@@ -126,13 +126,27 @@ Random access is supported for files with tabix indexes.  Simply call fetch for
 region you are interested in::
 
     >>> vcf_reader = vcf.Reader(filename='test/tb.vcf.gz')
-    >>> for record in vcf_reader.fetch('20', 1110696-1, 1230237):
+    >>> for record in vcf_reader.fetch('20', 1110696, 1230237):
     ...     print record
     Record(CHROM=20, POS=1110696, REF=A, ALT=['G', 'T'])
-    Record(CHROM=20, POS=1230237, REF=T, ALT=['.'])
+    Record(CHROM=20, POS=1230237, REF=T, ALT=[None])
+
+Or extract a single row::
+
+    >>> print vcf_reader.fetch('20', 1110696)
+    Record(CHROM=20, POS=1110696, REF=A, ALT=['G', 'T'])
+
+
+The ``Writer`` class provides a way of writing a VCF file.  Currently, you must specify a
+template ``Reader`` which provides the metadata::
+
+    >>> vcf_reader = vcf.Reader(filename='test/tb.vcf.gz')
+    >>> vcf_writer = vcf.Writer(file('/dev/null', 'w'), vcf_reader)
+    >>> for record in vcf_reader:
+    ...     vcf_writer.write_record(record)
 
 
 An extensible script is available to filter vcf files in vcf_filter.py.  VCF filters
 declared by other packages will be available for use in this script.  Please
-see FILTERS.md for full description.
+see :doc:`FILTERS` for full description.
 
diff --git a/docs/HISTORY.rst b/docs/HISTORY.rst
@@ -1,14 +1,22 @@
+Development
+===========
+
+Please use the repository at GitHub: https://github.com/jamescasbon/PyVCF/
+Pull requests gladly accepted.
+Issues should be reported at the GitHub issue tracker.
+
 Changes
 =======
 
-Pending
--------
+0.3.0 Release
+-------------
 
 * Fix setup.py for python < 2.7
-* Add ``__eq__`` to ``_Record``
+* Add ``__eq__`` to ``_Record`` and ``_Call``
 * Add ``is_het`` and ``is_variant`` to ``_Call``
 * Drop aggressive parse mode: we're always aggressive.
-* Add tabix fetch for single calls
+* Add tabix fetch for single calls, fix one->zero based indexing
+* Add ``prepend_chr`` mode for ``Reader`` to add ``chr`` to CHROM attributes
 
 0.2.2 Release
 -------------
@@ -44,5 +52,6 @@ Contributions
 -------------
 
 Project started by @jdoughertyii and taken over by @jamescasbon on 12th January 2011.
+Contributions from @arq5x, @brentp, @martijnvermaat, @ian1roberts.
 
 
diff --git a/docs/conf.py b/docs/conf.py
@@ -48,9 +48,10 @@
 # built documents.
 #
 # The short X.Y version.
-version = '0.2.2'
+import vcf
+version = vcf.VERSION
 # The full version, including alpha/beta/rc tags.
-release = '0.2.2'
+release = vcf.VERSION
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/docs/index.rst b/docs/index.rst
@@ -12,6 +12,7 @@ Contents:
    FILTERS
    HISTORY
 
+
 Indices and tables
 ==================
 

diff --git a/setup.py b/setup.py
@@ -9,6 +9,7 @@
 except ImportError:
     requires.append('argparse')
 
+import vcf
 
 setup(
     name='PyVCF',
@@ -17,6 +18,7 @@
     author='James Casbon and @jdoughertyii',
     author_email='[email protected]',
     description='Variant Call Format (VCF) parser for python',
+    long_description=vcf.__doc__,
     test_suite='test.test_vcf.suite',
     requires=requires,
     entry_points = {
@@ -26,5 +28,14 @@
         ]
     },
     url='https://github.com/jamescasbon/PyVCF',
-    version='0.2.2'
+    version=vcf.VERSION,
+    classifiers = [
+        'Development Status :: 3 - Alpha',
+        'Intended Audience :: Developers',
+        'Intended Audience :: Science/Research',
+        'Operating System :: OS Independent',
+        'Programming Language :: Python',
+        'Topic :: Scientific/Engineering',
+      ],
+    keywords='bioinformatics',
 )
diff --git a/test/issue-16.vcf b/test/issue-16.vcf
@@ -0,0 +1,21 @@
+##fileformat=VCFv4.0
+##fileDate=20090805
+##source=myImputationProgramV3.1
+##reference=1000GenomesPilot-NCBI36
+##phasing=partial
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency">
+##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
+##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
+##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Total number of alternate alleles in called genotypes">
+##FILTER=<ID=q10,Description="Quality below 10">
+##FILTER=<ID=s50,Description="Less than 50% of samples have data">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA00001	NA00002	NA00003
+20	1234568	.       G	.	.	PASS    NS=3;DP=9;AA=G  GT	./.	./.	./.
+
diff --git a/test/test_vcf.py b/test/test_vcf.py
@@ -227,8 +227,8 @@ def testFetchSite(self):
 
         site = self.reader.fetch('20', 14369)
         assert site is None
-        
-        
+
+
 
 
 class TestOpenMethods(unittest.TestCase):
@@ -299,6 +299,12 @@ def testApplyMultipleFilters(self):
         assert 'mgq50' in reader.filters
         assert 'sq30' in reader.filters
 
+class TestRegression(unittest.TestCase):
+
+    def test_issue_16(self):
+        reader = vcf.Reader(fh('issue-16.vcf'))
+        assert reader.next().QUAL == None
+
 
 
 suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestGatkOutput))
@@ -310,3 +316,4 @@ def testApplyMultipleFilters(self):
 suite.addTests(unittest.TestLoader().loadTestsFromTestCase(Test1kg))
 suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRecord))
 suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestCall))
+suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRegression))
diff --git a/vcf.py b/vcf.py
@@ -19,7 +19,7 @@
     Record(CHROM=20, POS=14370, REF=G, ALT=['A'])
     Record(CHROM=20, POS=17330, REF=T, ALT=['A'])
     Record(CHROM=20, POS=1110696, REF=A, ALT=['G', 'T'])
-    Record(CHROM=20, POS=1230237, REF=T, ALT=['.'])
+    Record(CHROM=20, POS=1230237, REF=T, ALT=[None])
     Record(CHROM=20, POS=1234567, REF=GTCT, ALT=['G', 'GTACT'])
 
 
@@ -67,7 +67,8 @@
     >>> print record.nucl_diversity, record.aaf
     0.6 0.5
     >>> print record.get_hets()
-    [Call(sample=NA00002, GT=1|0)]
+    [Call(sample=NA00002, GT=1|0, GQ=[48])]
+
 
 ``record.FORMAT`` will be a string specifying the format of the genotype
 fields.  In case the FORMAT column does not exist, ``record.FORMAT`` is
@@ -126,24 +127,29 @@
 region you are interested in::
 
     >>> vcf_reader = vcf.Reader(filename='test/tb.vcf.gz')
-    >>> for record in vcf_reader.fetch('20', 1110696-1, 1230237):
+    >>> for record in vcf_reader.fetch('20', 1110696, 1230237):
     ...     print record
     Record(CHROM=20, POS=1110696, REF=A, ALT=['G', 'T'])
-    Record(CHROM=20, POS=1230237, REF=T, ALT=['.'])
+    Record(CHROM=20, POS=1230237, REF=T, ALT=[None])
+
+Or extract a single row::
+
+    >>> print vcf_reader.fetch('20', 1110696)
+    Record(CHROM=20, POS=1110696, REF=A, ALT=['G', 'T'])
 
 
 The ``Writer`` class provides a way of writing a VCF file.  Currently, you must specify a
 template ``Reader`` which provides the metadata::
 
     >>> vcf_reader = vcf.Reader(filename='test/tb.vcf.gz')
-    >>> vcf_writer = vcf.Writer(file('/dev/null', 'w'))
+    >>> vcf_writer = vcf.Writer(file('/dev/null', 'w'), vcf_reader)
     >>> for record in vcf_reader:
-    ...     print r
+    ...     vcf_writer.write_record(record)
 
 
 An extensible script is available to filter vcf files in vcf_filter.py.  VCF filters
 declared by other packages will be available for use in this script.  Please
-see FILTERS.md for full description.
+see :doc:`FILTERS` for full description.
 
 '''
 import collections
@@ -153,13 +159,15 @@
 import sys
 import itertools
 
-
 try:
     import pysam
 except ImportError:
     pysam = None
 
 
+VERSION = '0.3.0'
+
+
 # Metadata parsers/constants
 RESERVED_INFO = {
     'AA': 'String', 'AC': 'Integer', 'AF': 'Float', 'AN': 'Integer',
@@ -391,7 +399,7 @@ def __eq__(self, other):
 
     def __iter__(self):
         return iter(self.samples)
-        
+
     def __str__(self):
         return "Record(CHROM=%(CHROM)s, POS=%(POS)s, REF=%(REF)s, ALT=%(ALT)s)" % self.__dict__
 
@@ -692,7 +700,7 @@ def next(self):
         alt = self._map(str, row[4].split(','))
 
         if row[5] == '.':
-            qual = '.'
+            qual = None
         else:
             qual = float(row[5]) if '.' in row[5] else int(row[5])
         filt = row[6].split(';') if ';' in row[6] else row[6]
@@ -732,14 +740,14 @@ def fetch(self, chrom, start, end=None):
 
         # not sure why tabix needs position -1
         start = start - 1
-        
+
         if end is None:
             self.reader = self._tabix.fetch(chrom, start, start+1)
             try:
                 return self.next()
             except StopIteration:
                 return None
-                
+
         self.reader = self._tabix.fetch(chrom, start, end)
         return self
-Original file line number
+Diff line change
@@ Expand Up / @@ -12,6 +12,7 @@ Contents: @@
        FILTERS
        HISTORY
     Indices and tables
     ==================
@@ Expand Down @@