-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added CLI, data downloader, tqdm, class-ified dictionary parser
Lots of changes throughout the project:
- Added an argparse-based CLI to the project as a whole in __main__.py
- Added data module for downloading and unpacking Words.zip
- Added tqdm to the inflections and dictionary parsers for reporting progress
- Made the dictionary parser a class; it now checks the base language (English) only once, and so runs much faster
- Loading branch information
Showing
13 changed files
with
383 additions
and
124 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
import doll.data | ||
import doll.input_parser | ||
import doll.parse_test | ||
import argparse | ||
|
||
# Help text for the command-line interface: an ASCII-art banner followed by an
# overview of the package's parts. It is passed to argparse.ArgumentParser as
# `description` together with RawTextHelpFormatter.
# NOTE(review): the banner's column alignment looks collapsed in this copy of
# the file -- confirm the spacing against the repository before relying on it.
description = """ | ||
DDDDDDDDDDDDD LLLLLLLLLL LLLLLLLLLL | ||
D::::::::::::DDD L::::::::L L::::::::L | ||
D:::::::::::::::DD L::::::::L L::::::::L | ||
DDD:::::DDDDD:::::D LL::::::LL LL::::::LL | ||
D:::::D D:::::D ooooooooooo L::::L L::::L | ||
D:::::D D:::::D oo:::::::::::oo L::::L L::::L | ||
D:::::D D:::::Do:::::::::::::::o L::::L L::::L | ||
D:::::D D:::::Do:::::ooooo:::::o L::::L L::::L | ||
D:::::D D:::::Do::::o o::::o L::::L L::::L | ||
D:::::D D:::::Do::::o o::::o L::::L L::::L | ||
D:::::D D:::::Do::::o o::::o L::::L L::::L | ||
D:::::D D:::::D o::::o o::::o L::::L LLLL L::::L LLLL | ||
DDD:::::DDDDD:::::D o:::::ooooo:::::oLL::::::LLLLLL:::LLL::::::LLLLLL:::L | ||
D:::::::::::::::DD o:::::::::::::::oL::::::::::::::::LL::::::::::::::::L | ||
D::::::::::::DDD oo:::::::::::oo L::::::::::::::::LL::::::::::::::::L | ||
DDDDDDDDDDDDD ooooooooooo LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL | ||
The Database of Latin Lexicon | ||
An implementation of William Whitaker\'s Words\' data model in Python. | ||
This program comprises three parts: | ||
- Downloader, to download the Words source files, contained in the 'data' | ||
module | ||
- Model, the sqlalchemy model of the data, contained in the 'db' module | ||
- Parser, which ingests the Words source files and populates the database | ||
with them, via the sqlalchemy model. This is contained in the | ||
input_parser module. | ||
In addition, there is a parse_test script in the root directory, | ||
demonstrating a use of the package to replicate certain functionality | ||
of Words. | ||
""" | ||
|
||
if __name__ == '__main__':
    # Command-line entry point. Each flag enables one independent stage; when
    # several are given they run in download -> build -> parse order.
    parser = argparse.ArgumentParser(description=description, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-f", "--force", action='store_true', help="Force a re-download of the words.zip file")
    parser.add_argument("-b", "--build", action='store_true', help="Build the database")
    parser.add_argument("-p", "--parse", action='store_true', help="Run the example parser")

    args = parser.parse_args()

    if args.force:
        # The data directory is created on demand so a first run works
        doll.data.download(create_dir=True)

    if args.build:
        doll.input_parser.parse_all_inputs(commit_changes=True)

    if args.parse:
        # Simple read-parse loop; ends on the literal command quit()
        while True:
            try:
                word = input('Enter a word to parse or type quit() to exit:\n=> ')
            except (EOFError, KeyboardInterrupt):
                # Ctrl-D / Ctrl-C (or exhausted piped input) ends the session
                # cleanly instead of dumping a traceback at the prompt.
                print()
                break

            # Tolerate stray whitespace around the quit command
            if word.strip() == 'quit()':
                break

            doll.parse_test.parse_word(word)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
""" | ||
DOLL Config File | ||
This file contains the user-configurable elements of the Database of Latin Lexicon | ||
""" | ||
|
||
# User-configurable settings, kept as a plain dict of string values.
config = {
    # File name of the database; the input parser looks for it under ~/.doll/
    'db_file': 'doll.db',
    # NOTE(review): the 'sqlalchemy.'-prefixed keys are presumably consumed by
    # SQLAlchemy's engine configuration -- confirm against the db module.
    'sqlalchemy.pool_recycle': '50',
    'sqlalchemy.echo': 'false',
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
from os import mkdir | ||
from sys import stdout | ||
from os.path import exists, expanduser, isdir, join | ||
from urllib.request import urlopen | ||
from zipfile import ZipFile | ||
|
||
|
||
def _doll_dir(create: bool = False) -> str:
    """Find or create the doll data directory.

    The directory is ``~/.doll``, expanded against the current user's home.

    :param create: whether to create the directory if it doesn't exist
    :type create: bool
    :return: the directory, whether pre-existing or just-created
    :rtype: str
    :raises RuntimeError: if the directory is missing and ``create`` is false,
        if it could not be created, or if the path exists but is not a
        directory
    """

    doll_dir = expanduser("~/.doll")

    # Already present: it only needs to actually be a directory
    if exists(doll_dir):
        if not isdir(doll_dir):
            raise RuntimeError("{0} exists but is not a directory".format(doll_dir))
        print("~/.doll directory already exists and was not created.")
        return doll_dir

    if not create:
        raise RuntimeError("doll data directory does not exist and was not created at {}\n"
                           "(rerun with create=True to create.)".format(doll_dir))

    print("Creating ~/.doll directory")
    try:
        mkdir(doll_dir)
    except OSError as err:
        # Chain the OS error so the underlying reason (permissions, missing
        # parent, ...) stays visible in the traceback.
        raise RuntimeError("Could not create doll data directory at {}".format(doll_dir)) from err

    return doll_dir
|
||
|
||
def download(create_dir: bool = False) -> None:
    """Download and extract the Words source files.

    Fetches ``wordsall.zip`` into the doll data directory, writing a progress
    line to stdout as it goes, then unpacks the archive into a ``wordsall``
    sub-directory of that data directory.

    :param create_dir: whether to create the directory if it doesn't exist
    :type create_dir: bool
    :return: None
    :raises RuntimeError: propagated from ``_doll_dir`` when the data
        directory is unavailable
    """

    words_all = 'wordsall'
    # NOTE(review): fetched over plain HTTP with no integrity check; consider
    # HTTPS or a checksum if the host supports it.
    words_url = 'http://archives.nd.edu/whitaker/wordsall.zip'
    data_dir = _doll_dir(create=create_dir)
    zip_path = join(data_dir, words_all + '.zip')
    block_size = 1024 * 64

    # Download the file. The response and the output file are both managed by
    # the with-statement so they are closed even if the transfer fails midway.
    with urlopen(words_url) as response, open(zip_path, 'wb') as file:
        # Content-Length is optional; without it (or with 0) no percentage
        # can be computed, so only the running byte count is shown.
        content_length = response.headers.get("Content-Length")
        file_size = int(content_length) if content_length else 0

        if file_size:
            print('Downloading {}.zip ({:,} bytes)'.format(words_all, file_size))
        else:
            print('Downloading {}.zip (size unknown)'.format(words_all))

        fetch_size = 0

        while True:
            data = response.read(block_size)
            if not data:
                break

            fetch_size += len(data)
            file.write(data)

            # '\r' rewrites the same console line on every block
            if file_size:
                status = '\r{:12,} bytes [{:5.1f}%]'.format(fetch_size, fetch_size * 100.0 / file_size)
            else:
                status = '\r{:12,} bytes'.format(fetch_size)
            stdout.write(status)
            stdout.flush()

    # Unpack the file
    print('\nUnpacking {}'.format(words_all + '.zip'))

    with ZipFile(zip_path, 'r') as zip_file:
        zip_file.extractall(join(data_dir, words_all))

    print('{} downloaded and extracted at {}'.format(words_all, data_dir))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,39 +1,47 @@ | ||
__author__ = 'Matthew' | ||
|
||
import os | ||
from ..input_parser.add_database_types import create_type_contents | ||
from ..input_parser.parse_dictionary import parse_dict_file | ||
from ..input_parser.parse_inflections import parse_inflect_file | ||
from ..config import config | ||
|
||
|
||
from doll.input_parser.add_database_types import create_type_contents | ||
from doll.input_parser.parse_dictionary import parse_dict_file | ||
from doll.input_parser.parse_inflections import parse_inflect_file | ||
def parse_all_inputs(words_dir: str = os.path.expanduser('~/.doll/wordsall'), commit_changes: bool = False):
    """Creates the database and parses all the inputs

    Checks that ``words_dir`` and the two required Words files exist, asks
    before replacing an existing database file, then runs the type setup,
    the inflections parser and the dictionary parser in that order. Problems
    found during the checks are printed and the function returns early.

    :param words_dir: Directory of wordsall
    :type words_dir: str
    :param commit_changes: Whether to commit changes to the database
    :type commit_changes: bool
    :return: None
    """

    # Add a trailing separator if necessary. Joining with '' leaves an empty
    # path empty, so it is rejected below rather than turned into the root.
    words_dir = os.path.join(words_dir, '')

    # First check that our words folder exists
    if not os.path.isdir(words_dir):
        print('Cannot find words_dir at {0}! Exiting...'.format(words_dir))
        return

    # And then that our input files exist
    files_to_find = ['INFLECTS.LAT', 'DICTLINE.GEN']
    missing_files = [f for f in files_to_find if not os.path.isfile(os.path.join(words_dir, f))]

    if missing_files:
        print('Unable to find the following file(s): ' + ', '.join(missing_files) + '. Exiting...')
        return

    # An existing database is only replaced after explicit confirmation
    db_path = os.path.join(os.path.expanduser("~/.doll"), config['db_file'])
    if os.path.isfile(db_path):
        answer = input('Database file exists, overwrite? (Yes/ No)')
        # Accept any answer starting with y/Y, ignoring surrounding whitespace
        if answer.strip()[:1].upper() != 'Y':
            print('Database file exists, exiting...')
            return
        os.remove(db_path)

    create_type_contents()

    parse_inflect_file(inflect_file=os.path.join(words_dir, 'INFLECTS.LAT'), commit_changes=commit_changes)

    parse_dict_file(dict_file=os.path.join(words_dir, 'DICTLINE.GEN'), commit_changes=commit_changes)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.