Skip to content

Commit

Permalink
Implement CLI
Browse files Browse the repository at this point in the history
  • Loading branch information
ZipFile committed Jul 28, 2019
1 parent 0b16a80 commit bd72190
Show file tree
Hide file tree
Showing 7 changed files with 469 additions and 9 deletions.
59 changes: 55 additions & 4 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,13 @@ Iterate large directories efficiently with python.
About
=====

``python-getdents`` is a simple wrapper around Linux system call ``getdents64`` (see ``man getdents`` for details). `Here's <http://be-n.com/spw/you-can-list-a-million-files-in-a-directory-but-not-with-ls.html>`_ some study on why ``ls``, ``os.listdir()`` and others are so slow when dealing with extremely large directories.

``python-getdents`` is a simple wrapper around the Linux system call ``getdents64`` (see ``man getdents`` for details). See `this article <http://be-n.com/spw/you-can-list-a-million-files-in-a-directory-but-not-with-ls.html>`_ for more details on the approach.

TODO
====

* Verify that implementation works on platforms other than ``x86_64``.


Install
=======

Expand All @@ -30,7 +28,7 @@ For development
python3 -m venv env
. env/bin/activate
pip install -e .
pip install -e .[test]
Run tests
=========
Expand Down Expand Up @@ -83,3 +81,56 @@ Advanced
)
os.close(fd)
CLI
---

Usage
~~~~~

::

python-getdents [-h] [-b N] [-o NAME] PATH

Options
~~~~~~~

+--------------------------+-------------------------------------------------+
| Option | Description |
+==========================+=================================================+
| ``-b N`` | Buffer size (in bytes) to allocate when |
| | iterating over directory. Default is 32768, the |
| | same value used by glibc, you probably want to |
+--------------------------+ increase this value. Try starting with 16777216 |
| ``--buffer-size N`` | (16 MiB). Best performance is achieved when |
| | buffer size rounds to size of the file system |
| | block. |
+--------------------------+-------------------------------------------------+
| ``-o NAME`` | Output format: |
| | |
| | * ``plain`` (default) Print only names. |
| | * ``csv`` Print as comma-separated values in |
+--------------------------+ order: inode, type, name. |
| ``--output-format NAME`` | * ``csv-headers`` Same as ``csv``, but print |
| | headers on the first line also. |
| | * ``json`` output as JSON array. |
| | * ``json-stream`` output each directory entry |
| | as single json object separated by newline. |
+--------------------------+-------------------------------------------------+

Exit codes
~~~~~~~~~~

* 3 - Requested buffer is too large.
* 4 - ``PATH`` not found.
* 5 - ``PATH`` is not a directory.
* 6 - Not enough permissions to read contents of the ``PATH``.
* 7 - Any other OS-level error.

Examples
~~~~~~~~

.. code-block:: sh
python-getdents /path/to/large/dir
python -m getdents /path/to/large/dir
python-getdents /path/to/large/dir -o csv -b 16777216 > dir.csv
8 changes: 8 additions & 0 deletions getdents/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from sys import exit

from . import __name__ as prog
from .cli import main


# Run the CLI when the package is executed as ``python -m getdents`` and hand
# the integer returned by ``main`` to the interpreter as the exit status.
if __name__ == '__main__':  # pragma: no cover
    status = main(prog=prog)
    exit(status)
65 changes: 65 additions & 0 deletions getdents/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from argparse import ArgumentParser
from sys import stderr

from . import MIN_GETDENTS_BUFF_SIZE, getdents
from .formatters import FORMATTERS


def parse_args(args, prog):
    """Parse the command line of the directory listing tool.

    Args:
        args: Sequence of argument strings, handed unchanged to
            ``ArgumentParser.parse_args``.
        prog: Program name displayed in usage and error messages.

    Returns:
        A ``(path, buff_size, formatter)`` tuple; ``formatter`` is the
        callable stored in ``FORMATTERS`` under the requested format name.

    The parser aborts through ``parser.error`` when the requested buffer
    size is smaller than ``MIN_GETDENTS_BUFF_SIZE``.
    """
    parser = ArgumentParser(prog=prog, description='Print directory contents.')
    parser.add_argument('path', metavar='PATH')

    buffer_help = (
        'Buffer size (in bytes) to allocate when iterating over directory'
    )
    parser.add_argument(
        '-b', '--buffer-size',
        metavar='N', type=int, default=32768, help=buffer_help,
    )

    # The help text lists names alphabetically, while ``choices`` keeps the
    # registry's own order.
    format_help = 'Output format: %s' % ', '.join(sorted(FORMATTERS))
    parser.add_argument(
        '-o', '--output-format',
        metavar='NAME', default='plain', choices=list(FORMATTERS),
        help=format_help,
    )

    options = parser.parse_args(args)

    # argparse can only check the type, so the lower bound is enforced here.
    if options.buffer_size < MIN_GETDENTS_BUFF_SIZE:
        parser.error('Minimum buffer size is %s' % MIN_GETDENTS_BUFF_SIZE)

    chosen_formatter = FORMATTERS[options.output_format]

    return options.path, options.buffer_size, chosen_formatter


def main(args=None, prog=None):
    """Run the CLI and return the process exit status.

    Args:
        args: Argument strings forwarded to ``parse_args``.
        prog: Program name forwarded to ``parse_args``.

    Returns:
        0 on success, 3 on ``MemoryError``, 4 on ``FileNotFoundError``,
        5 on ``NotADirectoryError``, 6 on ``PermissionError`` and 7 on any
        other ``OSError``. Every failure is reported on stderr.
    """
    path, buff_size, fmt = parse_args(args, prog)

    try:
        fmt(getdents(path, buff_size=buff_size))
    except MemoryError:
        print(
            'Not enough memory to allocate', buff_size, 'bytes of data',
            file=stderr,
        )
        return 3
    except OSError as error:
        print(error, file=stderr)

        # The three specific subclasses get their own exit codes; anything
        # else that is still an OSError falls through to the generic code.
        if isinstance(error, FileNotFoundError):
            return 4
        if isinstance(error, NotADirectoryError):
            return 5
        if isinstance(error, PermissionError):
            return 6

        return 7

    return 0
98 changes: 98 additions & 0 deletions getdents/formatters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from csv import writer as csv_writer
from itertools import chain
from json import dumps as json_dumps
from sys import stdout

from ._getdents import (
DT_BLK,
DT_CHR,
DT_DIR,
DT_FIFO,
DT_LNK,
DT_REG,
DT_SOCK,
DT_UNKNOWN,
)


# Column names printed as the first CSV row when headers are requested.
HEADER = ('inode', 'type', 'name')
# Registry of output formatters keyed by format name; populated by the
# ``formatter`` decorator below.
FORMATTERS = {}
# Short textual name for each ``DT_*`` entry type constant, used in the CSV
# and JSON output.
TYPE_NAMES = {
    DT_BLK: 'blk',
    DT_CHR: 'chr',
    DT_DIR: 'dir',
    DT_FIFO: 'fifo',
    DT_LNK: 'lnk',
    DT_REG: 'reg',
    DT_SOCK: 'sock',
    DT_UNKNOWN: 'unknown',
}


def formatter(name, registry=FORMATTERS):
    """Return a decorator that registers a function under ``name``.

    Args:
        name: Key to store the decorated function under.
        registry: Mapping that receives the function; defaults to the
            module-level ``FORMATTERS`` registry.

    The decorated function is returned unchanged.
    """
    def register(fn):
        registry[name] = fn
        return fn

    return register


@formatter('plain')
def format_plain(directory_entries, file=stdout):
    """Print only the name of every directory entry, one per line.

    Args:
        directory_entries: Iterable of ``(inode, type, name)`` tuples.
        file: Text stream to print to.
    """
    for _inode, _type, entry_name in directory_entries:
        print(entry_name, file=file)


class Echo:
    """File-like stand-in whose ``write`` hands its argument straight back."""

    def write(self, value):
        """Return ``value`` unchanged instead of writing it anywhere."""
        return value


@formatter('csv')
def format_csv(directory_entries, file=stdout, headers=False):
    """Print directory entries as CSV rows in the order inode, type, name.

    Args:
        directory_entries: Iterable of ``(inode, type, name)`` tuples. Any
            iterable is accepted; it is consumed exactly once.
        file: Text stream to print to.
        headers: When true, print the ``HEADER`` row before the first entry.

    Nothing at all is printed (not even the header) when there are no
    entries.
    """
    # Use one explicit iterator so a re-iterable input such as a list is not
    # restarted part-way through, which would repeat the header and
    # duplicate rows.
    entries = iter(directory_entries)

    # Fetch the first entry before producing any output: an empty input
    # prints nothing, and an error raised while fetching it propagates
    # before the header is written.
    try:
        first = next(entries)
    except StopIteration:
        return

    # ``writerow`` returns whatever the underlying ``write`` returns, so
    # with ``Echo`` it yields the formatted line for us to print.
    writer = csv_writer(Echo())

    if headers:
        print(writer.writerow(HEADER), end='', file=file)

    for inode, type_, name in chain((first,), entries):
        print(
            writer.writerow((inode, TYPE_NAMES[type_], name)),
            end='', file=file,
        )


@formatter('csv-headers')
def format_csv_headers(directory_entries, file=stdout):
    """Print entries as CSV, with the header row enabled.

    Thin wrapper that delegates to ``format_csv`` with ``headers=True``.
    """
    result = format_csv(directory_entries, headers=True, file=file)
    return result


def json_encode(inode, type, name):
    """Serialize one directory entry as a JSON object string.

    The object has the keys ``inode``, ``type`` and ``name``; the type is
    written as its short name looked up in ``TYPE_NAMES``.
    """
    record = {'inode': inode, 'type': TYPE_NAMES[type], 'name': name}
    return json_dumps(record)


@formatter('json')
def format_json(directory_entries, file=stdout):
    """Print directory entries as a single JSON array, one object per line.

    Args:
        directory_entries: Iterable of ``(inode, type, name)`` tuples. Any
            iterable is accepted; it is consumed exactly once.
        file: Text stream to print to.

    An input without entries is printed as an empty array, so the output is
    always valid JSON.
    """
    # Use one explicit iterator so a re-iterable input such as a list is not
    # restarted part-way through, which would duplicate entries.
    entries = iter(directory_entries)

    # Fetch the first entry before printing the opening bracket, so an error
    # raised while fetching it propagates before any output is produced.
    try:
        inode, type_, name = next(entries)
    except StopIteration:
        # No entries: still emit a well-formed (empty) array.
        print('[\n]', file=file)
        return

    print(
        '[\n', json_encode(inode, type_, name),
        sep='', end='', file=file,
    )

    # Every further entry is preceded by the separator, which avoids a
    # trailing comma after the last element.
    for inode, type_, name in entries:
        print(
            ',\n', json_encode(inode, type_, name),
            sep='', end='', file=file,
        )

    print('\n]', file=file)


@formatter('json-stream')
def format_json_stream(directory_entries, file=stdout):
    """Print every directory entry as its own JSON object on its own line.

    Args:
        directory_entries: Iterable of ``(inode, type, name)`` tuples.
        file: Text stream to print to.
    """
    for entry in directory_entries:
        inode, entry_type, name = entry
        print(json_encode(inode, entry_type, name), file=file)
16 changes: 11 additions & 5 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
#!/usr/bin/env python

from distutils.core import Extension
from setuptools import Extension, find_packages, setup

from setuptools import setup

tests_require = ['pytest', 'pretend']

setup(
name='getdents',
version='0.2',
version='0.3',
description='Python binding to linux syscall getdents64.',
long_description=open('README.rst').read(),
classifiers=[
Expand All @@ -22,12 +22,18 @@
author_email='[email protected]',
url='http://github.com/ZipFile/python-getdents',
license='BSD-2-Clause',
packages=['getdents'],
packages=find_packages(exclude=['tests']),
include_package_data=True,
zip_safe=False,
extras_require={
'test': tests_require,
},
ext_modules=[
Extension('getdents._getdents', sources=['getdents/_getdents.c']),
],
entry_points = {
'console_scripts': ['python-getdents=getdents.cli:main'],
},
setup_requires=['pytest-runner'],
tests_require=['pytest'],
tests_require=tests_require,
)
85 changes: 85 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import re

import pretend

from pytest import mark, raises

import getdents.cli as cli
from getdents.cli import main, parse_args
from getdents.formatters import (
FORMATTERS,
format_csv,
format_json,
format_plain,
)


@mark.parametrize(('args', 'expected'), [
    (['/tmp'], ('/tmp', 32768, format_plain)),
    (['-b', '1234', 'x', '-o', 'json'], ('x', 1234, format_json)),
    (
        ['--buffer-size', '9999', '--output-format', 'csv', 'xxx'],
        ('xxx', 9999, format_csv),
    ),
])
def test_parse_args(args, expected):
    """Defaults, short options and long options all parse as expected."""
    parsed = parse_args(args, 'test')

    assert parsed == expected


def test_parse_args_min_buff_size(capsys):
    """A buffer size of zero is rejected with an explanatory message."""
    with raises(SystemExit):
        parse_args(['test', '-b', '0'], 'test')

    captured = capsys.readouterr()
    match = re.search(r'Minimum buffer size is \d+', captured.err)

    assert match is not None


def test_main(monkeypatch):
    """``main`` passes the ``getdents`` result to the selected formatter."""
    entries = pretend.stub()

    @pretend.call_recorder
    def fake_formatter(directory_entries):
        return None

    @pretend.call_recorder
    def fake_getdents(path, buff_size=32768):
        return entries

    monkeypatch.setitem(FORMATTERS, 'test', fake_formatter)
    monkeypatch.setattr(cli, 'getdents', fake_getdents)

    exit_code = main(['x', '-o', 'test', '-b', '1024'], 'test')

    assert exit_code == 0
    assert fake_getdents.calls == [pretend.call('x', buff_size=1024)]
    assert fake_formatter.calls == [pretend.call(entries)]


def test_main_memory_error(monkeypatch):
    """A ``MemoryError`` from ``getdents`` yields exit code 3."""
    failing_getdents = pretend.raiser(MemoryError)
    monkeypatch.setattr(cli, 'getdents', failing_getdents)

    exit_code = main(['x'])

    assert exit_code == 3


def test_main_file_not_found_error(monkeypatch):
    """A ``FileNotFoundError`` from ``getdents`` yields exit code 4."""
    failing_getdents = pretend.raiser(FileNotFoundError)
    monkeypatch.setattr(cli, 'getdents', failing_getdents)

    exit_code = main(['x'])

    assert exit_code == 4


def test_main_not_a_directory_error(monkeypatch):
    """A ``NotADirectoryError`` from ``getdents`` yields exit code 5."""
    failing_getdents = pretend.raiser(NotADirectoryError)
    monkeypatch.setattr(cli, 'getdents', failing_getdents)

    exit_code = main(['x'])

    assert exit_code == 5


def test_main_permission_error(monkeypatch):
    """A ``PermissionError`` from ``getdents`` yields exit code 6."""
    failing_getdents = pretend.raiser(PermissionError)
    monkeypatch.setattr(cli, 'getdents', failing_getdents)

    exit_code = main(['x'])

    assert exit_code == 6


def test_main_os_error(monkeypatch):
    """A plain ``OSError`` from ``getdents`` yields the generic exit code 7."""
    failing_getdents = pretend.raiser(OSError)
    monkeypatch.setattr(cli, 'getdents', failing_getdents)

    exit_code = main(['x'])

    assert exit_code == 7
Loading

0 comments on commit bd72190

Please sign in to comment.