Skip to content

Commit

Permalink
Added CLI, data downloader, tqdm, class-ified dictionary parser
Browse files Browse the repository at this point in the history
Lots of changes throughout the project:

- Added an argparse-based CLI to the project as a whole in __main__.py
- Added data module for downloading and unpacking Words.zip
- Added tqdm to the inflections and dictionary parsers for reporting progress
- Made the dictionary parser a class; now checks base language (English)
  only once, and so runs much faster
  • Loading branch information
badge committed Nov 13, 2016
1 parent 0540c31 commit 36d99d9
Show file tree
Hide file tree
Showing 13 changed files with 383 additions and 124 deletions.
59 changes: 59 additions & 0 deletions doll/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import doll.data
import doll.input_parser
import doll.parse_test
import argparse

# Banner and overview text for the command-line help screen. It is passed as
# the `description` of the ArgumentParser built in the __main__ block, which
# uses argparse.RawTextHelpFormatter so the text is printed with the line
# breaks written here instead of being re-wrapped.
description = """
DDDDDDDDDDDDD LLLLLLLLLL LLLLLLLLLL
D::::::::::::DDD L::::::::L L::::::::L
D:::::::::::::::DD L::::::::L L::::::::L
DDD:::::DDDDD:::::D LL::::::LL LL::::::LL
D:::::D D:::::D ooooooooooo L::::L L::::L
D:::::D D:::::D oo:::::::::::oo L::::L L::::L
D:::::D D:::::Do:::::::::::::::o L::::L L::::L
D:::::D D:::::Do:::::ooooo:::::o L::::L L::::L
D:::::D D:::::Do::::o o::::o L::::L L::::L
D:::::D D:::::Do::::o o::::o L::::L L::::L
D:::::D D:::::Do::::o o::::o L::::L L::::L
D:::::D D:::::D o::::o o::::o L::::L LLLL L::::L LLLL
DDD:::::DDDDD:::::D o:::::ooooo:::::oLL::::::LLLLLL:::LLL::::::LLLLLL:::L
D:::::::::::::::DD o:::::::::::::::oL::::::::::::::::LL::::::::::::::::L
D::::::::::::DDD oo:::::::::::oo L::::::::::::::::LL::::::::::::::::L
DDDDDDDDDDDDD ooooooooooo LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL
The Database of Latin Lexicon
An implementation of William Whitaker\'s Words\' data model in Python.
This program comprises three parts:
- Downloader, to download the Words source files, contained in the 'data'
module
- Model, the sqlalchemy model of the data, contained in the 'db' module
- Parser, which ingests the Words source files and populates the database
with them, via the sqlalchemy model. This is contained in the
input_parser module.
In addition, there is a parse_test script in the root directory,
demonstrating a use of the package to replicate certain functionality
of Words.
"""

if __name__ == '__main__':
    # Command-line entry point. Each flag selects one independent action; the
    # selected actions always run in the order download -> build -> parse.
    parser = argparse.ArgumentParser(description=description, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-f", "--force", action='store_true', help="Force a re-download of the words.zip file")
    parser.add_argument("-b", "--build", action='store_true', help="Build the database")
    parser.add_argument("-p", "--parse", action='store_true', help="Run the example parser")

    args = parser.parse_args()

    if not (args.force or args.build or args.parse):
        # No action requested: show the usage text rather than exiting silently.
        parser.print_help()

    if args.force:
        # create_dir=True lets the downloader create the data directory on first use.
        doll.data.download(create_dir=True)

    if args.build:
        doll.input_parser.parse_all_inputs(commit_changes=True)

    if args.parse:
        # Read-parse loop; the literal text quit() ends it.
        while True:
            try:
                word = input('Enter a word to parse or type quit() to exit:\n=> ')
            except (EOFError, KeyboardInterrupt):
                # stdin closed (Ctrl-D / end of piped input) or Ctrl-C at the
                # prompt: leave the loop cleanly instead of dumping a traceback.
                print()
                break
            if word == 'quit()':
                break
            doll.parse_test.parse_word(word)
13 changes: 13 additions & 0 deletions doll/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
"""
DOLL Config File
This file contains the user-configurable elements of the Database of Latin Lexicon
"""

# User-editable settings shared across the package.
#
# doll.db.Connection appends 'db_file' to the ~/.doll directory to form the
# SQLite URL, stores that URL back into this dict under 'sqlalchemy.url', and
# passes the whole dict to sqlalchemy.engine_from_config. The values are kept
# as strings, the form a parsed configuration file would supply.
config = {
    # File name of the SQLite database inside ~/.doll
    "db_file": "doll.db",
    # Engine options, named with the 'sqlalchemy.' prefix
    "sqlalchemy.pool_recycle": "50",
    "sqlalchemy.echo": "false",
}
78 changes: 78 additions & 0 deletions doll/data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from os import mkdir
from sys import stdout
from os.path import exists, expanduser, isdir, join
from urllib.request import urlopen
from zipfile import ZipFile


def _doll_dir(create: bool = False):
"""Find or create the doll data directory
:param create: whether to create the directory if it doesn't exist
:type create: bool
:return the directory, whether pre-existing or just-created
"""

doll_dir = expanduser("~/.doll")

if not exists(doll_dir):
if not create:
raise RuntimeError("doll data directory does not exist and was not created at {}\n"
"(rerun with create=True to create.)".format(doll_dir))
print("Creating ~/.doll directory")
try:
mkdir(doll_dir)
except OSError:
raise RuntimeError("Could not create doll data directory at {}".format(doll_dir))
else:
if not isdir(doll_dir):
raise RuntimeError("{0} exists but is not a directory".format(doll_dir))
else:
print("~/.doll directory already exists and was not created.")

return doll_dir


def download(create_dir: bool = False):
    """Download and extract the Words source files.

    Fetches wordsall.zip into the doll data directory, writing a running
    progress line to stdout, then unpacks it into a ``wordsall``
    sub-directory of that directory.

    :param create_dir: whether to create the directory if it doesn't exist
    :type create_dir: bool
    :return: None
    :raises RuntimeError: propagated from _doll_dir when the data directory
        is missing (and create_dir is false) or cannot be used
    """

    words_all = 'wordsall'
    # NOTE(review): plain HTTP and no checksum on the archive - confirm this
    # is acceptable for the deployment.
    words_url = 'http://archives.nd.edu/whitaker/wordsall.zip'
    data_dir = _doll_dir(create=create_dir)
    zip_path = join(data_dir, words_all + '.zip')

    # Download the file. The response and the output file are both
    # context-managed so neither stays open if the transfer fails part-way.
    with urlopen(words_url) as response, open(zip_path, 'wb') as file:
        # Content-Length is optional; without a usable value only the running
        # byte count can be reported (and no percentage is computed).
        try:
            file_size = int(response.headers.get("Content-Length") or 0)
        except ValueError:
            file_size = 0

        if file_size > 0:
            print('Downloading {}.zip ({:,} bytes)'.format(words_all, file_size))
        else:
            print('Downloading {}.zip (size unknown)'.format(words_all))

        fetch_size = 0
        block_size = 1024 * 64

        while True:
            data = response.read(block_size)
            if not data:
                break

            fetch_size += len(data)
            file.write(data)

            # '\r' rewrites the same console line on every block.
            if file_size > 0:
                status = '\r{:12,} bytes [{:5.1f}%]'.format(fetch_size, fetch_size * 100.0 / file_size)
            else:
                status = '\r{:12,} bytes'.format(fetch_size)
            stdout.write(status)
            stdout.flush()

    # Unpack the file
    print('\nUnpacking {}'.format(words_all + '.zip'))

    with ZipFile(zip_path, 'r') as zip_file:
        zip_file.extractall(join(data_dir, words_all))

    print('{} downloaded and extracted at {}'.format(words_all, data_dir))
23 changes: 9 additions & 14 deletions doll/db/__init__.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,20 @@
__author__ = 'Matthew Badger'


from os.path import dirname
from os.path import expanduser
from sqlalchemy import engine_from_config
from sqlalchemy.orm import sessionmaker
from ..config import config
from .model import *

from doll.db.config import config
from doll.db.model import *


'''Connection class
Connects to the database in the root folder of the application
'''

# Connects to the database
class Connection:
"""Connection
Connects to the database in the user's .doll directory
"""

config = config

config['sqlalchemy.url'] = 'sqlite:///' + dirname(__file__) + '/' + config['db_file']
config['sqlalchemy.url'] = 'sqlite:///' + expanduser("~/.doll") + '/' + config['db_file']

__engine = engine_from_config(config, echo=False)

Expand Down
5 changes: 0 additions & 5 deletions doll/db/config.py

This file was deleted.

33 changes: 24 additions & 9 deletions doll/db/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@
"""

__author__ = 'Matthew Badger'

from sqlalchemy import Column, Integer, String, ForeignKey, Boolean, Unicode
from sqlalchemy.orm import relationship, backref
from sqlalchemy.ext.declarative import declarative_base, declared_attr

__author__ = 'Matthew Badger'


Base = declarative_base()

Expand All @@ -38,7 +38,7 @@
TypeBase, which defines an id, code, name and description.
Most of the type classes define no other columns. The code matches that
in Whitaker's source, and has a unique key. id is just for backrefernces
in Whitaker's source, and has a unique key. id is just for back-references
by sqlalchemy. name is hopefully a better thing to present to the user
than the code.
Expand All @@ -51,6 +51,9 @@ class TypeBase(object):
def __tablename__(self):
return 'type_' + self.__name__.lower()

def __repr__(self):
    """Return a tab-separated summary: id, code, then "name - description"."""
    fields = (self.id, self.code, self.name, self.description)
    return "{0}\t{1}\t{2} - {3}".format(*fields)

id = Column(Integer, primary_key=True)
code = Column(String(10), unique=True)
name = Column(String(50))
Expand Down Expand Up @@ -121,6 +124,16 @@ def __lt__(self, other):
return self.order < other.order


class RealConjugation(TypeBase, Base):
    """Real conjugation, which is a bit of a misnomer because
    it's only a bit more real than Conjugation"""

    # Sort position, stored separately from `code`: in the rows seeded by
    # create_type_contents, code 4 ('Fourth') has order 5 and code 5
    # ('Third -io') has order 4, so the two sequences differ.
    order = Column(Integer)

    def __lt__(self, other):
        # Compare on `order` only, so sorting instances follows that column.
        return self.order < other.order


class Person(TypeBase, Base):
"""Person - First, Second or Third"""

Expand Down Expand Up @@ -170,7 +183,6 @@ class Language(TypeBase, Base):
"""Languages for translation"""



"""Inflection Record Classes.
These classes define the inflection records, built from
Expand Down Expand Up @@ -202,7 +214,8 @@ class Record(Base):

# Other columns
stem_key = Column(Integer)
ending = Column(Unicode(20, collation='BINARY')) # We use binary collation so a is not ā
ending = Column(Unicode(20, collation='BINARY')) # We use binary collation so macrons are different
simple_ending = Column(Unicode(20))
notes = Column(Unicode(200, collation='BINARY'))

# Relationships
Expand Down Expand Up @@ -524,7 +537,8 @@ class Stem(Base):
name='FK_dictionary_stem_entry_id'))

stem_number = Column(Integer)
stem_word = Column(Unicode(20, collation='BINARY')) # We use binary collation so a is not ā
stem_word = Column(Unicode(20, collation='BINARY')) # We use binary collation so macrons are different
stem_simple_word = Column(Unicode(20))

# Relationships
entry = relationship('Entry', backref=backref('dictionary_stem'))
Expand Down Expand Up @@ -552,7 +566,6 @@ class TranslationSet(Base):
translations = relationship('Translation', backref=backref('dictionary_translation'))



# Translation
class Translation(Base):
"""A translation of a word in a given language"""
Expand All @@ -568,7 +581,6 @@ class Translation(Base):
translation_set = relationship('TranslationSet', backref=backref('dictionary_translation'))



# Noun Entry
class NounEntry(Base):
"""Noun entry in the dictionary"""
Expand Down Expand Up @@ -713,6 +725,9 @@ class VerbEntry(Base):

conjugation_code = Column(String(10), ForeignKey('type_conjugation.code',
name='FK_dictionary_verb_conjugation_code'))

realconjugation_code = Column(String(10), ForeignKey('type_realconjugation.code',
name='FK_dictionary_verb_realconjugation_code'))
variant = Column(Integer)

verb_kind_code = Column(String(10), ForeignKey('type_verbkind.code',
Expand Down Expand Up @@ -766,4 +781,4 @@ class InterjectionEntry(Base):
name='FK_dictionary_interjection_entry_id'))

# Relationships
entry = relationship('Entry', backref=backref('dictionary_interjection'))
entry = relationship('Entry', backref=backref('dictionary_interjection'))
42 changes: 25 additions & 17 deletions doll/input_parser/__init__.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,47 @@
__author__ = 'Matthew'

import os
from ..input_parser.add_database_types import create_type_contents
from ..input_parser.parse_dictionary import parse_dict_file
from ..input_parser.parse_inflections import parse_inflect_file
from ..config import config


from doll.input_parser.add_database_types import create_type_contents
from doll.input_parser.parse_dictionary import parse_dict_file
from doll.input_parser.parse_inflections import parse_inflect_file
def parse_all_inputs(words_dir: str = os.path.expanduser('~/.doll/wordsall'), commit_changes: bool = False):
"""Creates the database and parses all the inputs
:param words_dir: Directory of wordsall
:type words_dir: str
:param commit_changes: Whether to commit changes to the database
:type commit_changes: bool
def parse_all_inputs(words_folder, commit_changes):
"""Creates the database and parses all the inputs"""
:return None
"""

# Add a trailing slash if necessary
if (words_folder[-1:] != '/'):
words_folder += '/'
if words_dir[-1:] != '/':
words_dir += '/'

# First check that our words folder exists
if not os.path.isdir(words_folder):
print('Cannot find words_folder at {0}! Exiting...'.format(words_folder))
if not os.path.isdir(words_dir):
print('Cannot find words_dir at {0}! Exiting...'.format(words_dir))
return

# And then that our input files exist
files_to_find = ['INFLECTS.LAT', 'DICTLINE.GEN']
error_string = ', '.join([f for f in files_to_find if not os.path.isfile(words_folder + f)])
error_string = ', '.join([f for f in files_to_find if not os.path.isfile(words_dir + f)])

if not error_string == '':
print('Unable to find the following file(s): ' + error_string + '. Exiting...')
return

'''if os.path.isfile(config['db_file']):
if not input('Database file exists, overwrite? ([Y]es/ No)')[:1] == 'Y':
if os.path.isfile(os.path.expanduser("~/.doll/") + config['db_file']):
if not input('Database file exists, overwrite? (Yes/ No)')[:1] == 'Y':
print('Database file exists, exiting...')
return'''
return
else:
os.remove(os.path.expanduser("~/.doll/") + config['db_file'])

create_type_contents()

parse_inflect_file(inflect_file=words_folder + 'INFLECTS.LAT', commit_changes=commit_changes)
parse_inflect_file(inflect_file=words_dir + 'INFLECTS.LAT', commit_changes=commit_changes)

parse_dict_file(dict_file=words_folder + 'DICTLINE.GEN', commit_changes=commit_changes)
parse_dict_file(dict_file=words_dir + 'DICTLINE.GEN', commit_changes=commit_changes)
11 changes: 11 additions & 0 deletions doll/input_parser/add_database_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,17 @@ def create_type_contents():
Connection.session.add(Conjugation(code=8, name='Sixth', description='', order=8))
Connection.session.add(Conjugation(code=9, name='Sixth', description='', order=9))

Connection.session.add(RealConjugation(code=0, name='Unknown', description='', order=0))
Connection.session.add(RealConjugation(code=1, name='First', description='', order=1))
Connection.session.add(RealConjugation(code=2, name='Second', description='', order=2))
Connection.session.add(RealConjugation(code=3, name='Third', description='', order=3))
Connection.session.add(RealConjugation(code=4, name='Fourth', description='', order=5))
Connection.session.add(RealConjugation(code=5, name='Third -io', description='', order=4))
Connection.session.add(RealConjugation(code=6, name='Esse', description='', order=6))
Connection.session.add(RealConjugation(code=7, name='Eo', description='', order=7))
Connection.session.add(RealConjugation(code=8, name='Irregular', description='', order=8))
Connection.session.add(RealConjugation(code=9, name='Other', description='', order=9))

Connection.session.add(Person(code=0, name='Unknown', description='All, none, or unknown'))
Connection.session.add(Person(code=1, name='First', description=''))
Connection.session.add(Person(code=2, name='Second', description=''))
Expand Down
Loading

0 comments on commit 36d99d9

Please sign in to comment.