forked from Uberspace/lab
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
140 additions
and
46 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,56 +1,121 @@ | ||
import os | ||
from os.path import isfile | ||
from os.path import join | ||
#!.venv/bin/python | ||
""" | ||
Creates `new_words.txt` with words that failed a spellcheck. | ||
# sorts the given dict by the name and removes duplicates. | ||
# writes the result to sorted_dict.txt | ||
def sort_dict(): | ||
with open("source/dict.txt", "r") as f: | ||
words = f.readlines() | ||
If the `--merge` flag is given, the content of that file is merged in the | ||
spelling dictionary instead. | ||
sorted_words = sorted(words, key=lambda v: v.lower()) | ||
Run `make spelling` to run the actual spellcheck. | ||
""" | ||
import argparse | ||
import sys | ||
from collections import Counter | ||
from pathlib import Path | ||
from typing import List | ||
from typing import Optional | ||
|
||
with open("source/sorted_dict.txt", "w") as f: | ||
last = "" | ||
for word in sorted_words: | ||
if last != word: | ||
print(word[:-1], file=f) | ||
last = word | ||
|
||
*_, EPILOG = __doc__.strip().split("\n") | ||
DESCRIPTION = "\n".join(_) | ||
|
||
# reads the result of the spell checking (make spelling), counts for every misspelled words its amount | ||
# and returns a sorted list of word and amount to the shell and creates a new_words.txt just with the | ||
# words found in the analysis. | ||
def read_terms_from_errors(): | ||
BASE_PATH = Path(__file__).parent.absolute() | ||
NEW_WORDS_FILE = BASE_PATH / "new_words.txt" | ||
SPELLCHECK_DIR = BASE_PATH / "build/spelling" | ||
WORDLIST_FILE = BASE_PATH / "source/dict.txt" | ||
|
||
new_words = {} | ||
build_dir = "build/spelling/" | ||
total_words = 0 | ||
|
||
for f in os.listdir(build_dir): | ||
if isfile(join(build_dir, f)): | ||
with open(join(build_dir, f), "r") as f: | ||
lines = f.readlines() | ||
|
||
for line in lines: | ||
def get_misspelled_words(error_dir: Path) -> Counter: | ||
"""Return counter for misspelled words form files in *error_dir*.""" | ||
words = Counter() | ||
for error_file in (e for e in error_dir.glob("*.spelling")): | ||
lines = error_file.read_text().split("\n") | ||
for line in (line for l in lines if (line := l.strip())): | ||
try: | ||
word = line.split("(")[1].split(")")[0] | ||
total_words += 1 | ||
except IndexError: | ||
print("failed to parse line: {line}", file=sys.stderr) | ||
continue | ||
words[word] += 1 | ||
return words | ||
|
||
|
||
def merge_wordfiles(*files: List[Path], out_file: Optional[Path] = None) -> List: | ||
"""Combine words form *files*, return sorted results and write to *out_file* if set.""" | ||
words = set() | ||
for wordfile in (f for f in files if f.exists()): | ||
with wordfile.open() as f: | ||
for line in (line for l in f if (line := l.strip())): | ||
words.add(line) | ||
words = sorted(words, key=lambda word: (word.lower(), word)) | ||
if out_file is not None: | ||
out_file.write_text("\n".join(words) + "\n") | ||
return words | ||
|
||
|
||
def write_errors_to_newlist(error_dir: Path, outfile: Path) -> Counter: | ||
"""Write errors found in *error_dir* to *outfile*, sorted by hit count.""" | ||
if not (error_dir.exists() and error_dir.is_dir()): | ||
raise ValueError(f"'{error_dir}' not found") | ||
error_counter = get_misspelled_words(error_dir) | ||
outfile.write_text("\n".join((t[0] for t in error_counter.most_common()))) | ||
return error_counter | ||
|
||
if word not in new_words: | ||
new_words[word] = 1 | ||
else: | ||
new_words[word] += 1 | ||
|
||
with open("new_words.txt", "w") as f: | ||
for word in sorted(new_words, key=lambda w: new_words[w], reverse=True): | ||
print(word, file=f) | ||
print(word, new_words[word]) | ||
def merge_newlist_to_wordlist(newlist: Path, wordlist: Path) -> List: | ||
"""Merge *newlist* to *wordlist* and return the results.""" | ||
if not (newlist.exists() and newlist.is_file()): | ||
raise ValueError(f"'{newlist}' not found") | ||
return merge_wordfiles(newlist, wordlist, out_file=wordlist) | ||
|
||
print( | ||
"\nFound %d unique words in a total of %d misspelled words" | ||
% (len(new_words), total_words) | ||
|
||
def get_args(argv=None): | ||
ap = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG) | ||
ap.add_argument( | ||
"--merge", action="store_true", help="merge new words to dictionary" | ||
) | ||
return ap.parse_args(argv) | ||
|
||
|
||
def main( | ||
merge=False, | ||
newlist: Path = NEW_WORDS_FILE, | ||
wordlist: Path = WORDLIST_FILE, | ||
error_dir: Path = SPELLCHECK_DIR, | ||
): | ||
""" | ||
Write errors found in *error_dir* to *newlist*, sorted by hit count. | ||
Or merge *newlist* to *wordlist* instead, if *merge* is set. | ||
""" | ||
if merge: | ||
try: | ||
words = merge_newlist_to_wordlist(newlist, wordlist) | ||
print(f"Wrote {len(words)} words to dictionary '{wordlist}'.") | ||
except ValueError: | ||
print( | ||
f"[ERROR] no wordlist found in '{newlist}'.\n", | ||
"You can run without the `--merge` flag, to create it.", | ||
file=sys.stderr, | ||
) | ||
return 4 | ||
else: | ||
try: | ||
error_counter = write_errors_to_newlist(error_dir, newlist) | ||
print( | ||
f"Found {len(error_counter)} unique words in a total of " | ||
f"{sum(error_counter.values())} misspelled words." | ||
) | ||
except ValueError: | ||
print( | ||
f"[ERROR] no spellcheck results found in '{error_dir}'.\n", | ||
"You can run `make spelling` to create them.", | ||
file=sys.stderr, | ||
) | ||
return 3 | ||
|
||
return 0 | ||
|
||
|
||
read_terms_from_errors() | ||
sort_dict() | ||
if __name__ == "__main__": | ||
args = get_args() | ||
sys.exit(main(**vars(args))) |