From 6af81314356c89be70ec0d9c889f459dc1957d36 Mon Sep 17 00:00:00 2001 From: Andrei Batomunkuev Date: Sun, 14 Nov 2021 13:42:23 -0500 Subject: [PATCH] Updated Testing: - Refactored process_file function - Added new test cases, new files for testing --- ssg/text.py | 255 ++++++++++++-------- tests/test_files/test_empty_file.txt | 0 tests/test_files/test_unsupported_type.epub | 0 tests/text_test.py | 17 ++ tests/utils_test.py | 2 + 5 files changed, 170 insertions(+), 104 deletions(-) create mode 100644 tests/test_files/test_empty_file.txt create mode 100644 tests/test_files/test_unsupported_type.epub diff --git a/ssg/text.py b/ssg/text.py index c2ff187..71806f4 100644 --- a/ssg/text.py +++ b/ssg/text.py @@ -74,120 +74,167 @@ def process_file(self): - number of paragraphs - paragraphs """ + processed_content = {} contents = self.read_file() - html_content = [] + if not contents: + raise ValueError( + f"Empty file - {self.file_path}. No information to process" + ) + if self.file_path.endswith(".txt"): - # Splitting the content of the file by new line \n\n - splitted_content = contents.split("\n\n") - # handle

title with applied style: text-aligning to the center and margin bottom + processed_content = self.process_txt_file(contents) + elif self.file_path.endswith(".md"): + processed_content = self.process_md_file(contents) + else: + raise Exception( + f"File type - {self.file_path.split('.')[-1]} is not supported!" + ) + return processed_content + + def process_txt_file(self, contents): + """ + Method process the contents of the text (txt) files + Parameters + ---------- + self : Object (class File) + reference to the current instance of the class (TextFile) + contents : String + contents of the file + Returns + ------- + processed_content : Dictionary + Python dictionary containing the processed information: + - title + - number of paragraphs + - paragraphs + """ + + html_content = [] + # Splitting the content of the file by new line \n\n + splitted_content = contents.split("\n\n") + # handle

title with applied style: text-aligning to the center and margin bottom + html_content.append( + "

{title}

".format( + title=splitted_content[0] + ) + ) + # handle the rest of the content, wrapping it up in

tag + for paragraph in splitted_content[1:]: html_content.append( - "

{title}

".format( - title=splitted_content[0] + "

{content}

".format( + content=paragraph.encode("utf8").decode("utf8") ) ) - # handle the rest of the content, wrapping it up in

tag - for paragraph in splitted_content[1:]: + processed_content = { + "title": splitted_content[0], + "content": html_content, + "num_paragraphs": len(splitted_content), + } + return processed_content + + def process_md_file(self, contents): + """ + Method process the contents of the markdown files + Parameters + ---------- + self : Object (class File) + reference to the current instance of the class (TextFile) + contents : String + contents of the file + Returns + ------- + processed_content : Dictionary + Python dictionary containing the processed information: + - title + - number of paragraphs + - paragraphs + """ + content_title = "" + html_content = [] + # Capturing Frontmatter + frontmatter_content = re.findall("^---[\s\S]+?---", contents) + # Removing Frontmatter from the content + contents = re.sub("^---[\s\S]+?---\n", "", contents) + # Splitting the content of the markdown file by a new line \n\n + splitted_content = contents.split("\n\n") + for content in splitted_content: + # regex for .md syntax + reg_h1 = re.compile("[^#]*# (.*$)") + reg_h2 = "(^[^#])*## ([^#]+)*(.*$)" + reg_h3 = "(^[^#])*### ([^#]+)*(.*$)" + reg_italic = "[^\*]?\*([^\*]+)\*[^\*]?" + reg_bold = "[^\*]?\*{2}([^\*]+)\*{2}[^\*]?" + reg_link = "\[(.+)\]\((.+)\)" + reg_p = "(^[^#]*$)" + reg_newline = "\n" + reg_code = "\`(.*)\`" + reg_horizontal_rule = "^---$" + # Handling newline + content = re.sub(reg_newline, "
", content) + # Handling horizontal rule + content = re.sub(reg_horizontal_rule, "


", content) + # Handling italics and bold in italics + content = re.sub( + reg_italic, r"\1", re.sub(reg_bold, r"\1", content) + ) + # Handling bold and italics in bold + content = re.sub( + reg_bold, r"\1", re.sub(reg_italic, r"\1", content) + ) + # Handling code + content = re.sub(reg_code, r"\1", content) + # Handling Headers and paragraphs + content = re.sub(reg_p, r"

\1

", content) + content = re.sub( + reg_h3, + r"\1

\2

\3", + content, + ) + content = re.sub( + reg_h2, + r"\1

\2

\3", + content, + ) + # Handling links + content = re.sub(reg_link, r'\1', content) + if reg_h1.match(content): + content_title = content[1:] html_content.append( - "

{content}

".format( - content=paragraph.encode("utf8").decode("utf8") + "

{title}

".format( + title=content_title ) ) - processed_content = { - "title": splitted_content[0], - "content": html_content, - "num_paragraphs": len(splitted_content), - } - elif self.file_path.endswith(".md"): - content_title = "" - # Capturing Frontmatter - frontmatter_content = re.findall("^---[\s\S]+?---", contents) - # Removing Frontmatter from the content - contents = re.sub("^---[\s\S]+?---\n", "", contents) - # Splitting the content of the markdown file by a new line \n\n - splitted_content = contents.split("\n\n") - for content in splitted_content: - # regex for .md syntax - reg_h1 = re.compile("[^#]*# (.*$)") - reg_h2 = "(^[^#])*## ([^#]+)*(.*$)" - reg_h3 = "(^[^#])*### ([^#]+)*(.*$)" - reg_italic = "[^\*]?\*([^\*]+)\*[^\*]?" - reg_bold = "[^\*]?\*{2}([^\*]+)\*{2}[^\*]?" - reg_link = "\[(.+)\]\((.+)\)" - reg_p = "(^[^#]*$)" - reg_newline = "\n" - reg_code = "\`(.*)\`" - reg_horizontal_rule = "^---$" - - # Handling newline - content = re.sub(reg_newline, "
", content) - # Handling horizontal rule - content = re.sub(reg_horizontal_rule, "
", content) - # Handling italics and bold in italics - content = re.sub( - reg_italic, r"\1", re.sub(reg_bold, r"\1", content) - ) - # Handling bold and italics in bold - content = re.sub( - reg_bold, r"\1", re.sub(reg_italic, r"\1", content) - ) - # Handling code - content = re.sub(reg_code, r"\1", content) - # Handling Headers and paragraphs - content = re.sub(reg_p, r"

\1

", content) - content = re.sub( - reg_h3, - r"\1

\2

\3", - content, - ) - content = re.sub( - reg_h2, - r"\1

\2

\3", - content, + else: + html_content.append( + "{content}".format(content=content.encode("utf8").decode("utf8")) ) - # Handling links - content = re.sub(reg_link, r'\1', content) - - if reg_h1.match(content): - content_title = content[1:] - html_content.append( - "

{title}

".format( - title=content_title - ) - ) - else: - html_content.append( - "{content}".format( - content=content.encode("utf8").decode("utf8") - ) - ) - - processed_content = { - "title": content_title, - "content": html_content, - "num_paragraphs": len(splitted_content), - } - # Processing Markdown Formatter - # Extracting title field - if re.findall(r"title:\s*(.*)", frontmatter_content[0]): - processed_content["title"] = re.findall( - r"title:\s*(.*)", frontmatter_content[0] - )[0] - # Extracting description field - if re.findall(r"description:\s*(.*)", frontmatter_content[0]): - processed_content["description"] = re.findall( - r"description:\s*(.*)", frontmatter_content[0] - )[0] - # Extracting upload date field - if re.findall(r"upload_date:\s*(.*)", frontmatter_content[0]): - processed_content["upload_date"] = re.findall( - r"upload_date:\s*(.*)", frontmatter_content[0] - )[0] - # Extracting author field - if re.findall(r"author:\s*(.*)", frontmatter_content[0]): - processed_content["author"] = re.findall( - r"author:\s*(.*)", frontmatter_content[0] - )[0] + processed_content = { + "title": content_title, + "content": html_content, + "num_paragraphs": len(splitted_content), + } + # Processing Markdown Formatter + # Extracting title field + if re.findall(r"title:\s*(.*)", frontmatter_content[0]): + processed_content["title"] = re.findall( + r"title:\s*(.*)", frontmatter_content[0] + )[0] + # Extracting description field + if re.findall(r"description:\s*(.*)", frontmatter_content[0]): + processed_content["description"] = re.findall( + r"description:\s*(.*)", frontmatter_content[0] + )[0] + # Extracting upload date field + if re.findall(r"upload_date:\s*(.*)", frontmatter_content[0]): + processed_content["upload_date"] = re.findall( + r"upload_date:\s*(.*)", frontmatter_content[0] + )[0] + # Extracting author field + if re.findall(r"author:\s*(.*)", frontmatter_content[0]): + processed_content["author"] = re.findall( + r"author:\s*(.*)", frontmatter_content[0] + )[0] return processed_content def generate_html(self): diff --git a/tests/test_files/test_empty_file.txt b/tests/test_files/test_empty_file.txt new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_files/test_unsupported_type.epub b/tests/test_files/test_unsupported_type.epub new file mode 100644 index 0000000..e69de29 diff --git a/tests/text_test.py b/tests/text_test.py index d505f49..7797494 100644 --- a/tests/text_test.py +++ b/tests/text_test.py @@ -1,5 +1,6 @@ import os import shutil +import pytest from ssg import TextFile, determine_path, OUTPUT_DIR @@ -37,3 +38,19 @@ def test_genetate_html(self): assert os.path.isfile( dir_path + "/test_file.html" ), "HTML file should exist in OUTPUT directory" + + def test_process_file_invalid_file(self): + parsed_arg_obj = {"input": "./tests/test_files/test_unsupported_type.epub"} + path_obj = determine_path(parsed_arg_obj) + text_obj = TextFile(path_obj["file_path"], path_obj["dir_path"]) + # Should throw error + with pytest.raises(Exception): + text_obj.process_file() + + def test_process_empty_file(self): + parsed_arg_obj = {"input": "./tests/test_files/test_empty_file.txt"} + path_obj = determine_path(parsed_arg_obj) + text_obj = TextFile(path_obj["file_path"], path_obj["dir_path"]) + # Should throw error + with pytest.raises(Exception): + text_obj.process_file() diff --git a/tests/utils_test.py b/tests/utils_test.py index e217ca5..bc11f85 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -12,6 +12,7 @@ def test_determine_path_good_file(self): """ parsed_arg_obj = {"input": "./tests/test_files/test_file.md"} path_obj = determine_path(parsed_arg_obj) + # By default all the attributes of path_obj is set to None assert ( path_obj["file_path"] is not None ), "Path object should contain a path to the file" @@ -38,6 +39,7 @@ def test_determine_path_dir_file(self): """ parsed_arg_obj = {"input": "./tests/test_files"} path_obj = determine_path(parsed_arg_obj) + # By default all the attributes of path_obj is set to None assert ( path_obj["file_names"] is not None ), "Path object should contain file_names that are in directory"