Initial commit

Still need tests and to implement yamldata.
waylan · Mar 6, 2015 · 473433e · 473433e
commit 473433e
Show file tree

Hide file tree

Showing 7 changed files with 412 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,33 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.cache
+nosetests.xml
+coverage.xml
diff --git a/LICENSE.md b/LICENSE.md
@@ -0,0 +1,27 @@
+Copyright (c) 2015, Waylan Limberg
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer in the documentation and/or other
+materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors may
+be used to endorse or promote products derived from this software without
+specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
@@ -0,0 +1,169 @@
+DocData
+=======
+
+A better Meta-Data handler for lightweight markup languages.
+
+Currently, DocData supports MultiMarkdown style meta-data and YAML meta-data.
+While those meta-data formats are generally used in Markdown documents, DocData
+can work with any lightweight markup language as long as the document begins with
+an appropriately formatted meta-data section.
+
+Note that DocData does not detect the style of meta-data used within a document.
+You need to know ahead of time which format is used and call the appropriate
+parser.  Additionally, each parser will return the meta-data in a format unique to
+that format, so ensure that your code works with the format you have chosen.
+
+The parser for each format will strip the meta-data from the document. At that
+point you can forward the document on to your lightweight markup processor of
+choice. You might even want to use some of the meta-data to configure the behavior
+of your lightweight markup processor.
+
+YAML Meta-Data (not yet implemented)
+--------------
+
+Given a document that contains YAML style meta-data, simply pass it to
+the `docdata.yamldata.get_data` function:
+
+```python
+from docdata.yamldata import get_data
+
+doc, data = get_data(doc)
+```
+
+The `docdata.yamldata.get_data` function will return a tuple which contains the
+document with the meta-data removed and the meta-data as returned by the YAML
+parser. As YAML provides for and recognizes various types out-of-the-box,
+no additional features need to be provided. The document can now be passed to
+your lightweight markup processor of choice.
+
+MultiMarkdown Meta-Data
+-----------------------
+
+Given a document that contains MultiMarkdown style meta-data, simply pass it to
+the `docdata.mmddata.get_data` function:
+
+```python
+from docdata.mmddata import get_data
+
+doc, data = get_data(doc)
+```
+
+The `docdata.mmddata.get_data` function will return a tuple which contains the
+document with the meta-data removed and the meta-data as a Python dictionary.
+Now the document can be passed to your lightweight markup processor of choice.
+
+### Transformations
+
+Unlike other meta-data formats (such as YAML), MultiMarkdown style meta-data
+makes no assumptions about the types of the various values in the meta-data.
+In fact, each line of the data for a given key is an item in a list. Even
+a single line of meta-data results in a list with one item in it, each item
+being a string. To make your data more useful, DocData allows you to define
+transformations for various known keys in your meta-data.
+
+You can define the transformations as callables which accept a value and
+return a transformed value and register them by using the
+`docdata.mmddata.transformer` decorator:
+
+```python
+from docdata.mmddata import get_data, transformer
+from datetime import datetime
+
+@transformer('date')
+def date(value):
+    "Convert string (in YYYY/MM/DD format) to a datetime.datetime object. "
+    return datetime.strptime(value[0], '%Y/%m/%d')
+
+doc, data = get_data(doc)
+```
+
+Notice that the string `'date'` was passed to the `docdata.mmddata.transformer`
+decorator as the "key". That "key" corresponds to the key used in the meta-data.
+Therefore, the following meta-data would result in a `datetime.datetime` object
+being returned as the value of the 'date' key:
+
+```
+---
+Title: Some Title
+Date: 2015/03/05
+---
+```
+
+Note that with the above document, no transformer was defined for the "title" key.
+In that case, the title was passed through unaltered. However, you can define a
+default transformer for all unspecified keys. If you don't assign a key when
+registering a transformer, then that transformer is used as the default:
+
+```python
+@transformer()      # <= No key assigned here
+def default(value):
+    "The default transformer. "
+    return ' '.join(value)
+```
+
+### Thread Safety
+
+In the above examples, the transformers did not need to be passed to the
+meta-data parser. Still, `docdata.mmddata.get_data` did the right thing.
+This is because, behind the scenes, a global instance of a
+`docdata.mmddata.TransformerCollection` was defined within the `docdata.mmddata`
+module, and the `docdata.mmddata.transformer` decorator registers transformers
+with that collection.
+
+However, if you need to use unique collections of transformers (perhaps per
+request and/or per thread), you can create your own
+`docdata.mmddata.TransformerCollection` instance and work with that instance.
+In fact, the `TransformerCollection.register` method is a decorator you
+can use to register a transformer with a specific instance:
+
+```python
+from docdata.mmddata import get_data, TransformerCollection
+
+mytc = TransformerCollection()
+
+@mytc.register()
+def default(value):
+    return ' '.join(value)
+
+doc, data = get_data(doc, mytc)
+```
+
+Note that `docdata.mmddata.get_data` accepts a second optional argument, which
+must be an instance of the `docdata.mmddata.TransformerCollection` class.
+
+You can also pass in a dictionary of transformers when you create an instance
+of a `docdata.mmddata.TransformerCollection`:
+
+```python
+tc = TransformerCollection(
+    items={
+        'author': (lambda v: v[0].upper()),
+        'summary': (lambda v: '\n'.join(v)),
+        'tags': (lambda v: v)
+    },
+    default=(lambda v: ' '.join(v))
+)
+```
+
+Note that the "default" transformer must be assigned outside of the dictionary
+so that it is still possible to define a "default" key if necessary. This
+example is also interesting as each of the transformers is defined as a
+lambda functions. A transformer can be any callable which accepts one argument
+and returns an object.
+
+### Raw Data
+
+If you would like the raw data without any transformations, you can use the
+`docdata.mmddata.get_raw_data` function. It simply accepts a document and
+returns a document and a dictionary of raw data:
+
+```python
+from docdata.mmddata import get_raw_data
+
+doc, data = get_raw_data(doc)
+```
+
+While the `docdata.mmddata.get_data` function with no transformers defined would
+accomplish the same thing, using `docdata.mmddata.get_raw_data` should be slightly
+faster as it is unnecessary to iterate over the items and pass each one through
+a dummy, do-nothing transformer.
diff --git a/docdata/__init__.py b/docdata/__init__.py
@@ -0,0 +1 @@
+# Version of the docdata package.
+__version__ = '0.0.1'
diff --git a/docdata/mmddata.py b/docdata/mmddata.py
@@ -0,0 +1,148 @@
+"""
+MultiMarkdown Meta-Data
+
+Extracts, parses and transforms MultiMarkdown style data from documents.
+
+"""
+
+
+import re
+
+
+#####################################################################
+# Transformer Collection                                            #
+#####################################################################
+
+class TransformerCollection(object):
+    """
+    A collecton of transformers.
+
+    A transformer is a callable that accepts a single argument (the value to be transformed)
+    and returns a transformed value.
+    """
+
+    def __init__(self, items=None, default=None):
+        """
+        Create a transformer collection.
+
+        `items`: A dictionary which points to a transformer for each key (optional).
+
+        `default`: The default transformer (optional). If no default is provided,
+        then the values of unknown keys are returned unaltered.
+        """
+
+        self._registery = items or {}
+        self.default = default or (lambda v: v)
+
+    def register(self, key=None):
+        """
+        Decorator which registers a transformer for the given key.
+        
+        If no key is provided, a "default" transformer is registered.
+        """
+
+        def wrap(fn):
+            if key:
+                self._registery[key] = fn
+            else:
+                self.default = fn
+            return fn
+        return wrap
+
+    def transform(self, key, value):
+        """
+        Calls the transformer for the given key and returns the transformed value.
+        """
+
+        if key in self._registery:
+            return self._registery[key](value)
+        return self.default(value)
+
+    def transform_dict(self, data):
+        """
+        Calls the transformer for each item in a dictionary and returns a new dictionary.
+        """
+
+        newdata = {}
+        for k, v in data.items():
+            newdata[k] = self.transform(k, v)
+        return newdata
+
+
+# The global default transformer collection. The `transformer` decorator
+# registers with it and `get_data` uses it when no collection is passed in.
+tc = TransformerCollection()
+
+
+def transformer(key=None):
+    """
+    Decorator which registers a transformer for the given key.
+
+    If no key is provided, a "default" transformer is registered.
+    """
+
+    def wrap(fn):
+        tc.register(key)(fn)
+        return fn
+    return wrap
+
+
+#####################################################################
+# Data Parser                                                       #
+#####################################################################
+
+
+# Three dashes, optionally followed by whitespace and more text: the optional
+# opening delimiter on the first line of a document.
+# NOTE(review): the pattern is not anchored at the end of the line, so a line
+# such as `----` also matches -- confirm whether that is intended.
+BEGIN_RE = re.compile(r'^-{3}(\s.*)?')
+# `Key: value` -- the key may be indented by up to three spaces and consists of
+# ASCII letters, digits, underscores and hyphens.
+META_RE = re.compile(r'^[ ]{0,3}(?P<key>[A-Za-z0-9_-]+):\s*(?P<value>.*)')
+# A continuation line for the previous key: indented by four spaces or a tab.
+META_MORE_RE = re.compile(r'^([ ]{4}|\t)(\s*)(?P<value>.*)')
+# Three dashes or three dots, optionally followed by whitespace and more text:
+# the closing delimiter (same end-anchoring caveat as BEGIN_RE).
+END_RE = re.compile(r'^(-{3}|\.{3})(\s.*)?')
+
+
+def get_raw_data(doc):
+    """
+    Extract raw meta-data from a text document.
+
+    Returns a tuple of document and a data dict.
+    """
+
+    lines = doc.replace('\r\n', '\n').replace('\r', '\n').split('\n')
+
+    if lines and BEGIN_RE.match(lines[0]):
+        lines.pop(0)
+
+    data = {}
+    key = None
+    while lines:
+        line = lines.pop(0)
+
+        if line.strip() == '' or END_RE.match(line):
+            break  # blank line or end deliminator - done
+        m1 = META_RE.match(line)
+        if m1:
+            key = m1.group('key').lower().strip()
+            value = m1.group('value').strip()
+            try:
+                data[key].append(value)
+            except KeyError:
+                data[key] = [value]
+        else:
+            m2 = META_MORE_RE.match(line)
+            if m2 and key:
+                # Add another line to existing key
+                data[key].append(m2.group('value').strip())
+            else:
+                lines.insert(0, line)
+                break  # no meta data - done
+    return '\n'.join(lines), data
+
+
+def get_data(doc, transformers=tc):
+    """
+    Extract meta-data from a text document.
+
+    `transformers`: A TransformerCollection used to transform data values.
+
+    Returns a tuple of document and a (transformed) data dict.
+    """
+
+    doc, rawdata = get_raw_data(doc)
+    return doc, transformers.transform_dict(rawdata)