Added CLI, data downloader, tqdm progress bars, class-based dictionary parser

Lots of changes throughout the project:
- Added an argparse-based CLI for the project as a whole in __main__.py
- Added a data module for downloading and unpacking the Words archive (wordsall.zip)
- Added tqdm to the inflections and dictionary parsers for reporting progress
- Made the dictionary parser a class; it now checks the base language (English) only once, and so runs much faster
badge · Nov 13, 2016 · 36d99d9 · 36d99d9
1 parent 0540c31
commit 36d99d9
Show file tree

Hide file tree

Showing 13 changed files with 383 additions and 124 deletions.
diff --git a/doll/__main__.py b/doll/__main__.py
@@ -0,0 +1,59 @@
+import doll.data
+import doll.input_parser
+import doll.parse_test
+import argparse
+
+description = """
+DDDDDDDDDDDDD                         LLLLLLLLLL        LLLLLLLLLL
+D::::::::::::DDD                      L::::::::L        L::::::::L
+D:::::::::::::::DD                    L::::::::L        L::::::::L
+DDD:::::DDDDD:::::D                   LL::::::LL        LL::::::LL
+  D:::::D    D:::::D    ooooooooooo     L::::L            L::::L
+  D:::::D     D:::::D oo:::::::::::oo   L::::L            L::::L
+  D:::::D     D:::::Do:::::::::::::::o  L::::L            L::::L
+  D:::::D     D:::::Do:::::ooooo:::::o  L::::L            L::::L
+  D:::::D     D:::::Do::::o     o::::o  L::::L            L::::L
+  D:::::D     D:::::Do::::o     o::::o  L::::L            L::::L
+  D:::::D     D:::::Do::::o     o::::o  L::::L            L::::L
+  D:::::D    D:::::D o::::o     o::::o  L::::L      LLLL  L::::L      LLLL
+DDD:::::DDDDD:::::D  o:::::ooooo:::::oLL::::::LLLLLL:::LLL::::::LLLLLL:::L
+D:::::::::::::::DD   o:::::::::::::::oL::::::::::::::::LL::::::::::::::::L
+D::::::::::::DDD      oo:::::::::::oo L::::::::::::::::LL::::::::::::::::L
+DDDDDDDDDDDDD           ooooooooooo   LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL
+
+The Database of Latin Lexicon
+
+An implementation of William Whitaker\'s Words\' data model in Python.
+
+This program comprises three parts:
+    - Downloader, to download the Words source files, contained in the 'data'
+      module
+    - Model, the sqlalchemy model of the data, contained in the 'db' module
+    - Parser, which ingests the Words source files and populates the database
+      with them, via the sqlalchemy model. This is contained in the
+      input_parser module.
+
+In addition, there is a parse_test script in the root directory,
+demonstrating a use of the package to replicate certain functionality
+of Words.
+"""
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(description=description, formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument("-f", "--force", action='store_true', help="Force a re-download of the words.zip file")
+    parser.add_argument("-b", "--build", action='store_true', help="Build the database")
+    parser.add_argument("-p", "--parse", action='store_true', help="Run the example parser")
+
+    args = parser.parse_args()
+
+    if args.force:
+        doll.data.download(create_dir=True)
+    if args.build:
+        doll.input_parser.parse_all_inputs(commit_changes=True)
+    if args.parse:
+        while True:
+            word = input('Enter a word to parse or type quit() to exit:\n=> ')
+            if word == 'quit()':
+                break
+            doll.parse_test.parse_word(word)
diff --git a/doll/config.py b/doll/config.py
@@ -0,0 +1,13 @@
+"""
+
+    DOLL Config File
+
+    This file contains the user-configurable elements of the Dictionary Of Latin Lexicon
+
+"""
+
+config = {
+    'db_file': 'doll.db',
+    'sqlalchemy.pool_recycle': '50',
+    'sqlalchemy.echo': 'false'
+}
diff --git a/doll/data/__init__.py b/doll/data/__init__.py
@@ -0,0 +1,78 @@
+from os import mkdir
+from sys import stdout
+from os.path import exists, expanduser, isdir, join
+from urllib.request import urlopen
+from zipfile import ZipFile
+
+
+def _doll_dir(create: bool = False):
+    """Find or create the doll data directory
+
+    :param create: whether to create the directory if it doesn't exist
+    :type create: bool
+
+    :return the directory, whether pre-existing or just-created
+    """
+
+    doll_dir = expanduser("~/.doll")
+
+    if not exists(doll_dir):
+        if not create:
+            raise RuntimeError("doll data directory does not exist and was not created at {}\n"
+                               "(rerun with create=True to create.)".format(doll_dir))
+        print("Creating ~/.doll directory")
+        try:
+            mkdir(doll_dir)
+        except OSError:
+            raise RuntimeError("Could not create doll data directory at {}".format(doll_dir))
+    else:
+        if not isdir(doll_dir):
+            raise RuntimeError("{0} exists but is not a directory".format(doll_dir))
+        else:
+            print("~/.doll directory already exists and was not created.")
+
+    return doll_dir
+
+
+def download(create_dir: bool = False):
+    """Download and extract the Words source files
+
+    :param create_dir: whether to create the directory if it doesn't exist
+    :type create_dir: bool
+
+    :return None
+    """
+
+    words_all = 'wordsall'
+    words_url = 'http://archives.nd.edu/whitaker/wordsall.zip'
+    data_dir = _doll_dir(create=create_dir)
+
+    url = urlopen(words_url)
+
+    # Download the file
+    with open(join(data_dir, words_all + '.zip'), 'wb') as file:
+        file_size = int(url.headers["Content-Length"])
+        print('Downloading {}.zip ({:,} bytes)'.format(words_all, file_size))
+
+        fetch_size = 0
+        block_size = 1024 * 64
+
+        while True:
+            data = url.read(block_size)
+            if not data:
+                break
+
+            fetch_size += len(data)
+            file.write(data)
+
+            status = '\r{:12,} bytes [{:5.1f}%]'.format(fetch_size, fetch_size * 100.0 / file_size)
+            stdout.write(status)
+            stdout.flush()
+
+    # Unpack the file
+    print('\nUnpacking {}'.format(words_all + '.zip'))
+
+    with ZipFile(join(data_dir, words_all + '.zip'), 'r') as zip_file:
+        zip_file.extractall(join(data_dir, words_all))
+
+    print('{} downloaded and extracted at {}'.format(words_all, data_dir))
diff --git a/doll/db/__init__.py b/doll/db/__init__.py
@@ -1,25 +1,20 @@
-__author__ = 'Matthew Badger'
-
-
-from os.path import dirname
+from os.path import expanduser
 from sqlalchemy import engine_from_config
 from sqlalchemy.orm import sessionmaker
+from ..config import config
+from .model import *
 
-from doll.db.config import config
-from doll.db.model import *
-
-
-'''Connection class
-
-    Connects to the database in the root folder of the application
-
-'''
 
 # Connects to the database
 class Connection:
+    """Connection
+
+    Connects to the database in the user's .doll directory
+    """
+
     config = config
 
-    config['sqlalchemy.url'] = 'sqlite:///' + dirname(__file__) + '/' + config['db_file']
+    config['sqlalchemy.url'] = 'sqlite:///' + expanduser("~/.doll") + '/' + config['db_file']
 
     __engine = engine_from_config(config, echo=False)
 

diff --git a/doll/db/config.py b/doll/db/config.py
diff --git a/doll/db/model.py b/doll/db/model.py
@@ -21,12 +21,12 @@
         
 """
 
-__author__ = 'Matthew Badger'
-
 from sqlalchemy import Column, Integer, String, ForeignKey, Boolean, Unicode
 from sqlalchemy.orm import relationship, backref
 from sqlalchemy.ext.declarative import declarative_base, declared_attr
 
+__author__ = 'Matthew Badger'
+
 
 Base = declarative_base()
 
@@ -38,7 +38,7 @@
 TypeBase, which defines an id, code, name and description.
 
 Most of the type classes define no other columns. The code matches that
-in Whitaker's source, and has a unique key. id is just for backrefernces
+in Whitaker's source, and has a unique key. id is just for back-references
 by sqlalchemy. name is hopefully a better thing to present to the user
 than the code.
 
@@ -51,6 +51,9 @@ class TypeBase(object):
     def __tablename__(self):
         return 'type_' + self.__name__.lower()
 
+    def __repr__(self):
+        return "{0}\t{1}\t{2} - {3}".format(self.id, self.code, self.name, self.description)
+
     id = Column(Integer, primary_key=True)
     code = Column(String(10), unique=True)
     name = Column(String(50))
@@ -121,6 +124,16 @@ def __lt__(self, other):
         return self.order < other.order
 
 
+class RealConjugation(TypeBase, Base):
+    """Real conjugation, which is a bit of a misnomer because
+    it's only a bit more real than Conjugation"""
+
+    order = Column(Integer)
+
+    def __lt__(self, other):
+        return self.order < other.order
+
+
 class Person(TypeBase, Base):
     """Person - First, Second or Third"""
 
@@ -170,7 +183,6 @@ class Language(TypeBase, Base):
     """Languages for translation"""
 
 
-
 """Inflection Record Classes.
 
 These classes define the inflection records, built from
@@ -202,7 +214,8 @@ class Record(Base):
 
     # Other columns
     stem_key = Column(Integer)
-    ending = Column(Unicode(20, collation='BINARY'))  # We use binary collation so a is not ā
+    ending = Column(Unicode(20, collation='BINARY'))  # We use binary collation so macrons are different
+    simple_ending = Column(Unicode(20))
     notes = Column(Unicode(200, collation='BINARY'))
 
     # Relationships
@@ -524,7 +537,8 @@ class Stem(Base):
                                           name='FK_dictionary_stem_entry_id'))
 
     stem_number = Column(Integer)
-    stem_word = Column(Unicode(20, collation='BINARY'))  # We use binary collation so a is not ā
+    stem_word = Column(Unicode(20, collation='BINARY'))  # We use binary collation so macrons are different
+    stem_simple_word = Column(Unicode(20))
 
     # Relationships
     entry = relationship('Entry', backref=backref('dictionary_stem'))
@@ -552,7 +566,6 @@ class TranslationSet(Base):
     translations = relationship('Translation', backref=backref('dictionary_translation'))
 
 
-
 # Translation
 class Translation(Base):
     """A translation of a word in a given language"""
@@ -568,7 +581,6 @@ class Translation(Base):
     translation_set = relationship('TranslationSet', backref=backref('dictionary_translation'))
 
 
-
 # Noun Entry
 class NounEntry(Base):
     """Noun entry in the dictionary"""
@@ -713,6 +725,9 @@ class VerbEntry(Base):
 
     conjugation_code = Column(String(10), ForeignKey('type_conjugation.code',
                                                      name='FK_dictionary_verb_conjugation_code'))
+
+    realconjugation_code = Column(String(10), ForeignKey('type_realconjugation.code',
+                                                         name='FK_dictionary_verb_realconjugation_code'))
     variant = Column(Integer)
 
     verb_kind_code = Column(String(10), ForeignKey('type_verbkind.code',
@@ -766,4 +781,4 @@ class InterjectionEntry(Base):
                                           name='FK_dictionary_interjection_entry_id'))
 
     # Relationships
-    entry = relationship('Entry', backref=backref('dictionary_interjection'))
+    entry = relationship('Entry', backref=backref('dictionary_interjection'))
diff --git a/doll/input_parser/__init__.py b/doll/input_parser/__init__.py
@@ -1,39 +1,47 @@
-__author__ = 'Matthew'
-
 import os
+from ..input_parser.add_database_types import create_type_contents
+from ..input_parser.parse_dictionary import parse_dict_file
+from ..input_parser.parse_inflections import parse_inflect_file
+from ..config import config
+
 
-from doll.input_parser.add_database_types import create_type_contents
-from doll.input_parser.parse_dictionary import parse_dict_file
-from doll.input_parser.parse_inflections import parse_inflect_file
+def parse_all_inputs(words_dir: str = os.path.expanduser('~/.doll/wordsall'), commit_changes: bool = False):
+    """Creates the database and parses all the inputs
 
+    :param words_dir: Directory of wordsall
+    :type words_dir: str
+    :param commit_changes: Whether to commit changes to the database
+    :type commit_changes: bool
 
-def parse_all_inputs(words_folder, commit_changes):
-    """Creates the database and parses all the inputs"""
+    :return None
+    """
 
     # Add a trailing slash if necessary
-    if (words_folder[-1:] != '/'):
-        words_folder += '/'
+    if words_dir[-1:] != '/':
+        words_dir += '/'
 
     # First check that our words folder exists
-    if not os.path.isdir(words_folder):
-        print('Cannot find words_folder at {0}! Exiting...'.format(words_folder))
+    if not os.path.isdir(words_dir):
+        print('Cannot find words_dir at {0}! Exiting...'.format(words_dir))
         return
 
     # And then that our input files exist
     files_to_find = ['INFLECTS.LAT', 'DICTLINE.GEN']
-    error_string = ', '.join([f for f in files_to_find if not os.path.isfile(words_folder + f)])
+    error_string = ', '.join([f for f in files_to_find if not os.path.isfile(words_dir + f)])
 
     if not error_string == '':
         print('Unable to find the following file(s): ' + error_string + '. Exiting...')
         return
 
-    '''if os.path.isfile(config['db_file']):
-        if not input('Database file exists, overwrite? ([Y]es/ No)')[:1] == 'Y':
+    if os.path.isfile(os.path.expanduser("~/.doll/") + config['db_file']):
+        if not input('Database file exists, overwrite? (Yes/ No)')[:1] == 'Y':
             print('Database file exists, exiting...')
-            return'''
+            return
+        else:
+            os.remove(os.path.expanduser("~/.doll/") + config['db_file'])
 
     create_type_contents()
 
-    parse_inflect_file(inflect_file=words_folder + 'INFLECTS.LAT', commit_changes=commit_changes)
+    parse_inflect_file(inflect_file=words_dir + 'INFLECTS.LAT', commit_changes=commit_changes)
 
-    parse_dict_file(dict_file=words_folder + 'DICTLINE.GEN', commit_changes=commit_changes)
+    parse_dict_file(dict_file=words_dir + 'DICTLINE.GEN', commit_changes=commit_changes)
diff --git a/doll/input_parser/add_database_types.py b/doll/input_parser/add_database_types.py
@@ -184,6 +184,17 @@ def create_type_contents():
     Connection.session.add(Conjugation(code=8, name='Sixth', description='', order=8))
     Connection.session.add(Conjugation(code=9, name='Sixth', description='', order=9))
 
+    Connection.session.add(RealConjugation(code=0, name='Unknown', description='', order=0))
+    Connection.session.add(RealConjugation(code=1, name='First', description='', order=1))
+    Connection.session.add(RealConjugation(code=2, name='Second', description='', order=2))
+    Connection.session.add(RealConjugation(code=3, name='Third', description='', order=3))
+    Connection.session.add(RealConjugation(code=4, name='Fourth', description='', order=5))
+    Connection.session.add(RealConjugation(code=5, name='Third -io', description='', order=4))
+    Connection.session.add(RealConjugation(code=6, name='Esse', description='', order=6))
+    Connection.session.add(RealConjugation(code=7, name='Eo', description='', order=7))
+    Connection.session.add(RealConjugation(code=8, name='Irregular', description='', order=8))
+    Connection.session.add(RealConjugation(code=9, name='Other', description='', order=9))
+
     Connection.session.add(Person(code=0, name='Unknown', description='All, none, or unknown'))
     Connection.session.add(Person(code=1, name='First', description=''))
     Connection.session.add(Person(code=2, name='Second', description=''))