diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9ae9c29 --- /dev/null +++ b/.gitignore @@ -0,0 +1,33 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.cache +nosetests.xml +coverage.xml \ No newline at end of file diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..e9db915 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,27 @@ +Copyright (c) 2015, Waylan Limberg +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or other +materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may +be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..c56cc80 --- /dev/null +++ b/README.md @@ -0,0 +1,169 @@ +DocData +======= + +A better Meta-Data handler for lightweight markup languages. + +Currently, DocData supports MultiMarkdown style meta-data and YAML meta-data. +While those meta-data formats are generally used in Markdown documents, DocData +can work with any lightweight markup language as long as the document begins with +an appropriately formatted meta-data section. + +Note that DocData does not detect the style of meta-data used within a document. +You need to know ahead of time which format is used and call the appropriate +parser. Additionally, each parser will return the meta-data in a format unique to +that format, so ensure that your code works with the format you have chosen. + +The parser for each format will strip the meta-data from the document. At that +point you can forward the document on to your lightweight markup processor of +choice. You might even want to use some of the meta-data to configure the behavior +of your lightweight markup processor. 
+ +YAML Meta-Data (not yet implemented) +-------------- + +Given a document that contains YAML style meta-data, simply pass it to +the `docdata.yamldata.get_data` function: + +```python +from docdata.yamldata import get_data + +doc, data = get_data(doc) +``` + +The `docdata.yamldata.get_data` function will return a tuple which contains the +document with the meta-data removed and the meta-data as returned by the YAML +parser. As YAML provides for and recognizes various types out-of-the-box, +no additional features need to be provided. The document can now be passed to +your lightweight markup processor of choice. + +MultiMarkdown Meta-Data +----------------------- + +Given a document that contains MultiMarkdown style meta-data, simply pass it to +the `docdata.mmddata.get_data` function: + +```python +from docdata.mmddata import get_data + +doc, data = get_data(doc) +``` + +The `docdata.mmddata.get_data` function will return a tuple which contains the +document with the meta-data removed and the meta-data as a Python dictionary. +Now the document can be passed to your lightweight markup processor of choice. + +### Transformations + +Unlike other meta-data formats (such as YAML), MultiMarkdown style meta-data +makes no assumptions about the types of the various values in the meta-data. +In fact, each line of the data for a given key is an item in a list. Even +a single line of meta-data results in a list with one item in it, each item +being a string. To make your data more useful, DocData allows you to define +transformations for various known keys in your meta-data. + +You can define the transformations as callables which accept a value and +return a transformed value and register them by using the +`docdata.mmddata.transformer` decorator: + +```python +from docdata.mmddata import get_data, transformer +from datetime import datetime + +@transformer('date') +def date(value): + "Convert string (in YYYY/MM/DD format) to a datetime.datetime object. 
" + return datetime.strptime(value[0], '%Y/%m/%d') + +doc, data = get_data(doc) +``` + +Notice that the string `'date'` was passed to the `docdata.mmddata.transformer` +decorator as the "key". That "key" corresponds to the key used in the meta-data. +Therefore, the following meta-data would result in a `datetime.datetime` object +being returned as the value of the 'date' key: + +``` +--- +Title: Some Title +Date: 2015/03/05 +--- +``` + +Note that with the above document, no transformer was defined for the "title" key. +In that case, the title was passed through unaltered. However, you can define a +default transformer for all unspecified keys. If you don't assign a key when +registering a transformer, then that transformer is used as the default: + +```python +@transformer() # <= No key assigned here +def default(value): + "The default transformer. " + return ' '.join(value) +``` + +### Thread Safety + +In the above examples, the transformers did not need to be passed to the +meta-data parser. Still, `docdata.mmddata.get_data` did the right thing. +This is because, behind the scenes, a global instance of a +`docdata.mmddata.TransformerCollection` was defined within the `docdata.mmddata` +module, and the `docdata.mmddata.transformer` decorator registers transformers +with that collection. + +However, if you need to use unique collections of transformers (perhaps per +request and/or per thread), you can create your own +`docdata.mmddata.TransformerCollection` instance and work with that instance. 
+In fact, the `TransformerCollection.register` method is a decorator you +can use to register a transformer with a specific instance: + +```python +from docdata.mmddata import get_data, TransformerCollection + +mytc = TransformerCollection() + +@mytc.register() +def default(value): + return ' '.join(value) + +doc, data = get_data(doc, mytc) +``` + +Note that `docdata.mmddata.get_data` accepts a second optional argument, which +must be an instance of the `docdata.mmddata.TransformerCollection` class. + +You can also pass in a dictionary of transformers when you create an instance +of a `docdata.mmddata.TransformerCollection`: + +```python +tc = TransformerCollection( + items={ + 'author': (lambda v: v[0].upper()), + 'summary': (lambda v: '\n'.join(v)), + 'tags': (lambda v: v) + }, + default=(lambda v: ' '.join(v)) +) +``` + +Note that the "default" transformer must be assigned outside of the dictionary +so that it is still possible to define a "default" key if necessary. This +example is also interesting as each of the transformers is defined as +lambda functions. A transformer can be any callable which accepts one argument +and returns an object. + +### Raw Data + +If you would like the raw data without any transformations, you can use the +`docdata.mmddata.get_raw_data` function. It simply accepts a document and +returns a document and a dictionary of raw data: + +```python +from docdata.mmddata import get_raw_data + +doc, data = get_raw_data(doc) +``` + +While the `docdata.mmddata.get_data` function with no transformers defined would +accomplish the same thing, using `docdata.mmddata.get_raw_data` should be slightly +faster as it is unnecessary to iterate over the items and pass each one through +a dummy, do-nothing transformer. 
diff --git a/docdata/__init__.py b/docdata/__init__.py new file mode 100644 index 0000000..99c4176 --- /dev/null +++ b/docdata/__init__.py @@ -0,0 +1 @@ +__version__ = '0.0.1' \ No newline at end of file diff --git a/docdata/mmddata.py b/docdata/mmddata.py new file mode 100644 index 0000000..8a1df14 --- /dev/null +++ b/docdata/mmddata.py @@ -0,0 +1,148 @@ +""" +MultiMarkdown Meta-Data + +Extracts, parses and transforms MultiMarkdown style data from documents. + +""" + + +import re + + +##################################################################### +# Transformer Collection # +##################################################################### + +class TransformerCollection(object): + """ + A collecton of transformers. + + A transformer is a callable that accepts a single argument (the value to be transformed) + and returns a transformed value. + """ + + def __init__(self, items=None, default=None): + """ + Create a transformer collection. + + `items`: A dictionary which points to a transformer for each key (optional). + + `default`: The default transformer (optional). If no default is provided, + then the values of unknown keys are returned unaltered. + """ + + self._registery = items or {} + self.default = default or (lambda v: v) + + def register(self, key=None): + """ + Decorator which registers a transformer for the given key. + + If no key is provided, a "default" transformer is registered. + """ + + def wrap(fn): + if key: + self._registery[key] = fn + else: + self.default = fn + return fn + return wrap + + def transform(self, key, value): + """ + Calls the transformer for the given key and returns the transformed value. + """ + + if key in self._registery: + return self._registery[key](value) + return self.default(value) + + def transform_dict(self, data): + """ + Calls the transformer for each item in a dictionary and returns a new dictionary. 
+ """ + + newdata = {} + for k, v in data.items(): + newdata[k] = self.transform(k, v) + return newdata + + +# The global default transformer collection. +tc = TransformerCollection() + + +def transformer(key=None): + """ + Decorator which registers a transformer for the given key. + + If no key is provided, a "default" transformer is registered. + """ + + def wrap(fn): + tc.register(key)(fn) + return fn + return wrap + + +##################################################################### +# Data Parser # +##################################################################### + + +BEGIN_RE = re.compile(r'^-{3}(\s.*)?') +META_RE = re.compile(r'^[ ]{0,3}(?P[A-Za-z0-9_-]+):\s*(?P.*)') +META_MORE_RE = re.compile(r'^([ ]{4}|\t)(\s*)(?P.*)') +END_RE = re.compile(r'^(-{3}|\.{3})(\s.*)?') + + +def get_raw_data(doc): + """ + Extract raw meta-data from a text document. + + Returns a tuple of document and a data dict. + """ + + lines = doc.replace('\r\n', '\n').replace('\r', '\n').split('\n') + + if lines and BEGIN_RE.match(lines[0]): + lines.pop(0) + + data = {} + key = None + while lines: + line = lines.pop(0) + + if line.strip() == '' or END_RE.match(line): + break # blank line or end deliminator - done + m1 = META_RE.match(line) + if m1: + key = m1.group('key').lower().strip() + value = m1.group('value').strip() + try: + data[key].append(value) + except KeyError: + data[key] = [value] + else: + m2 = META_MORE_RE.match(line) + if m2 and key: + # Add another line to existing key + data[key].append(m2.group('value').strip()) + else: + lines.insert(0, line) + break # no meta data - done + return '\n'.join(lines), data + + +def get_data(doc, transformers=tc): + """ + Extract meta-data from a text document. + + `transformers`: A TransformerCollection used to transform data values. + + Returns a tuple of document and a (transformed) data dict. 
+ """ + + doc, rawdata = get_raw_data(doc) + return doc, transformers.transform_dict(rawdata) \ No newline at end of file diff --git a/docdata/yamldata.py b/docdata/yamldata.py new file mode 100644 index 0000000..5b6b38a --- /dev/null +++ b/docdata/yamldata.py @@ -0,0 +1,18 @@ +""" +YAML Meta-Data + +Extracts, parses and transforms YAML style data from documents. + +""" + + +import yaml + + +def get_data(doc): + """ + Extract meta-data from a text document. + + Returns a tuple of document and data. + """ + pass # TODO: implement this \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..1600716 --- /dev/null +++ b/setup.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python + +from setuptools import setup +from docdata import __version__ as ver + +setup( + name='docdata', + description='A better Meta-Data handler for lightweight markup languages.', + author='Waylan Limberg', + author_email='waylan.limberg@icloud.com', + version=ver, + url='https://github.com/waylan/docdata', + packages=['docdata'], + install_requires = ['yaml'], + license='BSD' +) \ No newline at end of file