From 6af81314356c89be70ec0d9c889f459dc1957d36 Mon Sep 17 00:00:00 2001
From: Andrei Batomunkuev <abatomunkuev@myseneca.ca>
Date: Sun, 14 Nov 2021 13:42:23 -0500
Subject: [PATCH] Updated Testing: - Refactored process_file function - Added
 new test cases, new files for testing

---
 ssg/text.py                                 | 255 ++++++++++++--------
 tests/test_files/test_empty_file.txt        |   0
 tests/test_files/test_unsupported_type.epub |   0
 tests/text_test.py                          |  17 ++
 tests/utils_test.py                         |   2 +
 5 files changed, 170 insertions(+), 104 deletions(-)
 create mode 100644 tests/test_files/test_empty_file.txt
 create mode 100644 tests/test_files/test_unsupported_type.epub
diff --git a/ssg/text.py b/ssg/text.py
index c2ff187..71806f4 100644
--- a/ssg/text.py
+++ b/ssg/text.py
@@ -74,120 +74,167 @@ def process_file(self):
             - number of paragraphs
             - paragraphs
         """
+        processed_content = {}
         contents = self.read_file()
-        html_content = []
+        if not contents:
+            raise ValueError(
+                f"Empty file - {self.file_path}. No information to process"
+            )
+
         if self.file_path.endswith(".txt"):
-            # Splitting the content of the file by new line \n\n
-            splitted_content = contents.split("\n\n")
-            # handle <h1> title with applied style: text-aligning to the center and margin bottom
+            processed_content = self.process_txt_file(contents)
+        elif self.file_path.endswith(".md"):
+            processed_content = self.process_md_file(contents)
+        else:
+            raise Exception(
+                f"File type - {self.file_path.split('.')[-1]} is not supported!"
+            )
+        return processed_content
+
+    def process_txt_file(self, contents):
+        """
+        Process the contents of a plain-text (.txt) file.
+        Parameters
+        ----------
+        self : Object (class File)
+            reference to the current instance of the class (TextFile)
+        contents : String
+            contents of the file
+        Returns
+        -------
+        processed_content : Dictionary
+            Python dictionary containing the processed information:
+            - title
+            - number of paragraphs
+            - paragraphs
+        """
+
+        html_content = []
+        # Splitting the content of the file on blank lines (\n\n)
+        splitted_content = contents.split("\n\n")
+        # handle <h1> title with applied style: text-aligning to the center and margin bottom
+        html_content.append(
+            "<h1 style='text-align: center; margin-bottom: 15px'>{title}</h1>".format(
+                title=splitted_content[0]
+            )
+        )
+        # handle the rest of the content, wrapping it up in <p> tag
+        for paragraph in splitted_content[1:]:
             html_content.append(
-                "<h1 style='text-align: center; margin-bottom: 15px'>{title}</h1>".format(
-                    title=splitted_content[0]
+                "<p>{content}</p>".format(
+                    content=paragraph.encode("utf8").decode("utf8")
                 )
             )
-            # handle the rest of the content, wrapping it up in <p> tag
-            for paragraph in splitted_content[1:]:
+        processed_content = {
+            "title": splitted_content[0],
+            "content": html_content,
+            "num_paragraphs": len(splitted_content),
+        }
+        return processed_content
+
+    def process_md_file(self, contents):
+        """
+        Process the contents of a Markdown (.md) file.
+        Parameters
+        ----------
+        self : Object (class File)
+            reference to the current instance of the class (TextFile)
+        contents : String
+            contents of the file
+        Returns
+        -------
+        processed_content : Dictionary
+            Python dictionary containing the processed information:
+            - title
+            - number of paragraphs
+            - paragraphs
+        """
+        content_title = ""
+        html_content = []
+        # Capturing Frontmatter
+        frontmatter_content = re.findall("^---[\s\S]+?---", contents)
+        # Removing Frontmatter from the content
+        contents = re.sub("^---[\s\S]+?---\n", "", contents)
+        # Splitting the content of the markdown file on blank lines (\n\n)
+        splitted_content = contents.split("\n\n")
+        for content in splitted_content:
+            # regex for .md syntax
+            reg_h1 = re.compile("[^#]*# (.*$)")
+            reg_h2 = "(^[^#])*## ([^#]+)*(.*$)"
+            reg_h3 = "(^[^#])*### ([^#]+)*(.*$)"
+            reg_italic = "[^\*]?\*([^\*]+)\*[^\*]?"
+            reg_bold = "[^\*]?\*{2}([^\*]+)\*{2}[^\*]?"
+            reg_link = "\[(.+)\]\((.+)\)"
+            reg_p = "(^[^#]*$)"
+            reg_newline = "\n"
+            reg_code = "\`(.*)\`"
+            reg_horizontal_rule = "^---$"
+            # Handling newline
+            content = re.sub(reg_newline, "<br>", content)
+            # Handling horizontal rule
+            content = re.sub(reg_horizontal_rule, "<hr>", content)
+            # Handling italics and bold in italics
+            content = re.sub(
+                reg_italic, r"<i>\1</i>", re.sub(reg_bold, r"<b>\1</b>", content)
+            )
+            # Handling bold and italics in bold
+            content = re.sub(
+                reg_bold, r"<b>\1</b>", re.sub(reg_italic, r"<i>\1</i>", content)
+            )
+            # Handling code
+            content = re.sub(reg_code, r"<code>\1</code>", content)
+            # Handling Headers and paragraphs
+            content = re.sub(reg_p, r"<p>\1</p>", content)
+            content = re.sub(
+                reg_h3,
+                r"\1<h3 style='text-align: center; margin-bottom: 15px'>\2</h3>\3",
+                content,
+            )
+            content = re.sub(
+                reg_h2,
+                r"\1<h2 style='text-align: center; margin-bottom: 15px'>\2</h2>\3",
+                content,
+            )
+            # Handling links
+            content = re.sub(reg_link, r'<a href="\2">\1</a>', content)
+            if reg_h1.match(content):
+                content_title = content[1:]
                 html_content.append(
-                    "<p>{content}</p>".format(
-                        content=paragraph.encode("utf8").decode("utf8")
+                    "<h1 style='text-align: center; margin-bottom: 15px'>{title}</h1>".format(
+                        title=content_title
                     )
                 )
-            processed_content = {
-                "title": splitted_content[0],
-                "content": html_content,
-                "num_paragraphs": len(splitted_content),
-            }
-        elif self.file_path.endswith(".md"):
-            content_title = ""
-            # Capturing Frontmatter
-            frontmatter_content = re.findall("^---[\s\S]+?---", contents)
-            # Removing Frontmatter from the content
-            contents = re.sub("^---[\s\S]+?---\n", "", contents)
-            # Splitting the content of the markdown file by a new line \n\n
-            splitted_content = contents.split("\n\n")
-            for content in splitted_content:
-                # regex for .md syntax
-                reg_h1 = re.compile("[^#]*# (.*$)")
-                reg_h2 = "(^[^#])*## ([^#]+)*(.*$)"
-                reg_h3 = "(^[^#])*### ([^#]+)*(.*$)"
-                reg_italic = "[^\*]?\*([^\*]+)\*[^\*]?"
-                reg_bold = "[^\*]?\*{2}([^\*]+)\*{2}[^\*]?"
-                reg_link = "\[(.+)\]\((.+)\)"
-                reg_p = "(^[^#]*$)"
-                reg_newline = "\n"
-                reg_code = "\`(.*)\`"
-                reg_horizontal_rule = "^---$"
-
-                # Handling newline
-                content = re.sub(reg_newline, "<br>", content)
-                # Handling horizontal rule
-                content = re.sub(reg_horizontal_rule, "<hr>", content)
-                # Handling italics and bold in italics
-                content = re.sub(
-                    reg_italic, r"<i>\1</i>", re.sub(reg_bold, r"<b>\1</b>", content)
-                )
-                # Handling bold and italics in bold
-                content = re.sub(
-                    reg_bold, r"<b>\1</b>", re.sub(reg_italic, r"<i>\1</i>", content)
-                )
-                # Handling code
-                content = re.sub(reg_code, r"<code>\1</code>", content)
-                # Handling Headers and paragraphs
-                content = re.sub(reg_p, r"<p>\1</p>", content)
-                content = re.sub(
-                    reg_h3,
-                    r"\1<h3 style='text-align: center; margin-bottom: 15px'>\2</h3>\3",
-                    content,
-                )
-                content = re.sub(
-                    reg_h2,
-                    r"\1<h2 style='text-align: center; margin-bottom: 15px'>\2</h2>\3",
-                    content,
+            else:
+                html_content.append(
+                    "{content}".format(content=content.encode("utf8").decode("utf8"))
                 )
-                # Handling links
-                content = re.sub(reg_link, r'<a href="\2">\1</a>', content)
-
-                if reg_h1.match(content):
-                    content_title = content[1:]
-                    html_content.append(
-                        "<h1 style='text-align: center; margin-bottom: 15px'>{title}</h1>".format(
-                            title=content_title
-                        )
-                    )
-                else:
-                    html_content.append(
-                        "{content}".format(
-                            content=content.encode("utf8").decode("utf8")
-                        )
-                    )
-
-            processed_content = {
-                "title": content_title,
-                "content": html_content,
-                "num_paragraphs": len(splitted_content),
-            }
-            # Processing Markdown Formatter
-            # Extracting title field
-            if re.findall(r"title:\s*(.*)", frontmatter_content[0]):
-                processed_content["title"] = re.findall(
-                    r"title:\s*(.*)", frontmatter_content[0]
-                )[0]
-            # Extracting description field
-            if re.findall(r"description:\s*(.*)", frontmatter_content[0]):
-                processed_content["description"] = re.findall(
-                    r"description:\s*(.*)", frontmatter_content[0]
-                )[0]
-            # Extracting upload date field
-            if re.findall(r"upload_date:\s*(.*)", frontmatter_content[0]):
-                processed_content["upload_date"] = re.findall(
-                    r"upload_date:\s*(.*)", frontmatter_content[0]
-                )[0]
-            # Extracting author field
-            if re.findall(r"author:\s*(.*)", frontmatter_content[0]):
-                processed_content["author"] = re.findall(
-                    r"author:\s*(.*)", frontmatter_content[0]
-                )[0]
 
+        processed_content = {
+            "title": content_title,
+            "content": html_content,
+            "num_paragraphs": len(splitted_content),
+        }
+        # Processing Markdown Frontmatter
+        # Extracting title field
+        if re.findall(r"title:\s*(.*)", frontmatter_content[0]):
+            processed_content["title"] = re.findall(
+                r"title:\s*(.*)", frontmatter_content[0]
+            )[0]
+        # Extracting description field
+        if re.findall(r"description:\s*(.*)", frontmatter_content[0]):
+            processed_content["description"] = re.findall(
+                r"description:\s*(.*)", frontmatter_content[0]
+            )[0]
+        # Extracting upload date field
+        if re.findall(r"upload_date:\s*(.*)", frontmatter_content[0]):
+            processed_content["upload_date"] = re.findall(
+                r"upload_date:\s*(.*)", frontmatter_content[0]
+            )[0]
+        # Extracting author field
+        if re.findall(r"author:\s*(.*)", frontmatter_content[0]):
+            processed_content["author"] = re.findall(
+                r"author:\s*(.*)", frontmatter_content[0]
+            )[0]
         return processed_content
 
     def generate_html(self):
diff --git a/tests/test_files/test_empty_file.txt b/tests/test_files/test_empty_file.txt
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_files/test_unsupported_type.epub b/tests/test_files/test_unsupported_type.epub
new file mode 100644
index 0000000..e69de29
diff --git a/tests/text_test.py b/tests/text_test.py
index d505f49..7797494 100644
--- a/tests/text_test.py
+++ b/tests/text_test.py
@@ -1,5 +1,6 @@
 import os
 import shutil
+import pytest
 from ssg import TextFile, determine_path, OUTPUT_DIR
 
 
@@ -37,3 +38,19 @@ def test_genetate_html(self):
         assert os.path.isfile(
             dir_path + "/test_file.html"
         ), "HTML file should exist in OUTPUT directory"
+
+    def test_process_file_invalid_file(self):
+        parsed_arg_obj = {"input": "./tests/test_files/test_unsupported_type.epub"}
+        path_obj = determine_path(parsed_arg_obj)
+        text_obj = TextFile(path_obj["file_path"], path_obj["dir_path"])
+        # An unsupported file type should raise an exception
+        with pytest.raises(Exception):
+            text_obj.process_file()
+
+    def test_process_empty_file(self):
+        parsed_arg_obj = {"input": "./tests/test_files/test_empty_file.txt"}
+        path_obj = determine_path(parsed_arg_obj)
+        text_obj = TextFile(path_obj["file_path"], path_obj["dir_path"])
+        # An empty file should raise an exception
+        with pytest.raises(Exception):
+            text_obj.process_file()
diff --git a/tests/utils_test.py b/tests/utils_test.py
index e217ca5..bc11f85 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -12,6 +12,7 @@ def test_determine_path_good_file(self):
         """
         parsed_arg_obj = {"input": "./tests/test_files/test_file.md"}
         path_obj = determine_path(parsed_arg_obj)
+        # By default all the attributes of path_obj are set to None
         assert (
             path_obj["file_path"] is not None
         ), "Path object should contain a path to the file"
@@ -38,6 +39,7 @@ def test_determine_path_dir_file(self):
         """
         parsed_arg_obj = {"input": "./tests/test_files"}
         path_obj = determine_path(parsed_arg_obj)
+        # By default all the attributes of path_obj are set to None
         assert (
             path_obj["file_names"] is not None
         ), "Path object should contain file_names that are in directory"