Skip to content

Commit

Permalink
Updated Testing:
Browse files Browse the repository at this point in the history
- Refactored process_file function
- Added new test cases, new files for testing
  • Loading branch information
abatomunkuev committed Nov 14, 2021
1 parent ba6acaf commit 6af8131
Show file tree
Hide file tree
Showing 5 changed files with 170 additions and 104 deletions.
255 changes: 151 additions & 104 deletions ssg/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,120 +74,167 @@ def process_file(self):
- number of paragraphs
- paragraphs
"""
processed_content = {}
contents = self.read_file()
html_content = []
if not contents:
raise ValueError(
f"Empty file - {self.file_path}. No information to process"
)

if self.file_path.endswith(".txt"):
# Splitting the content of the file by new line \n\n
splitted_content = contents.split("\n\n")
# handle <h1> title with applied style: text-aligning to the center and margin bottom
processed_content = self.process_txt_file(contents)
elif self.file_path.endswith(".md"):
processed_content = self.process_md_file(contents)
else:
raise Exception(
f"File type - {self.file_path.split('.')[-1]} is not supported!"
)
return processed_content

def process_txt_file(self, contents):
    """
    Process the contents of a plain-text (txt) file into HTML fragments.

    The text is split on blank lines. The first chunk is used as the
    title and rendered as a centered <h1>; every following chunk is
    wrapped in a <p> tag. The text is inserted into the markup as-is
    (no HTML escaping is applied).

    Parameters
    ----------
    self : Object (class File)
        reference to the current instance of the class (TextFile)
    contents : String
        contents of the file
    Returns
    -------
    processed_content : Dictionary
        Python dictionary containing the processed information:
        - title          : first chunk of the file
        - content        : list of HTML strings (<h1> first, then <p>s)
        - num_paragraphs : number of chunks, the title chunk included
    """
    # str.split always yields at least one element, so the unpacking is
    # safe even when the file holds a single line.
    title, *paragraphs = contents.split("\n\n")

    # <h1> title with applied style: centered text and a bottom margin
    html_content = [
        f"<h1 style='text-align: center; margin-bottom: 15px'>{title}</h1>"
    ]
    # The rest of the content, each chunk wrapped in a <p> tag
    html_content.extend(f"<p>{paragraph}</p>" for paragraph in paragraphs)

    return {
        "title": title,
        "content": html_content,
        # Counts every chunk produced by the split, title included.
        "num_paragraphs": len(paragraphs) + 1,
    }

def process_md_file(self, contents):
    """
    Process the contents of a markdown file into HTML fragments.

    An optional frontmatter section (text between two '---' markers at
    the very start of the file) is captured and removed. The remaining
    text is split on blank lines and each chunk is converted with a
    series of regular-expression substitutions (line breaks, horizontal
    rule, bold/italic, inline code, paragraphs, h3/h2 headers, links).
    A chunk recognised as a level-1 header becomes a centered <h1> and
    provides the default title.

    Parameters
    ----------
    self : Object (class File)
        reference to the current instance of the class (TextFile)
    contents : String
        contents of the file
    Returns
    -------
    processed_content : Dictionary
        Python dictionary containing the processed information:
        - title          : frontmatter "title" if present, otherwise the
                           text of the last level-1 header ("" if none)
        - content        : list of HTML strings, one per chunk
        - num_paragraphs : number of chunks after the split
        - description, upload_date, author : only when the frontmatter
                           defines them
    """
    header_style = "text-align: center; margin-bottom: 15px"

    # Capturing Frontmatter
    frontmatter_content = re.findall(r"^---[\s\S]+?---", contents)
    # Removing Frontmatter from the content
    contents = re.sub(r"^---[\s\S]+?---\n", "", contents)
    # Splitting the content of the markdown file by a new line \n\n
    splitted_content = contents.split("\n\n")

    # regex for .md syntax; raw strings avoid invalid-escape warnings and
    # compiling here keeps the work out of the per-chunk loop.
    reg_h1 = re.compile(r"[^#]*# (.*$)")
    reg_h2 = re.compile(r"(^[^#])*## ([^#]+)*(.*$)")
    reg_h3 = re.compile(r"(^[^#])*### ([^#]+)*(.*$)")
    # NOTE(review): the optional [^\*]? on each side of the italic/bold
    # patterns is part of the match, so the character next to the markers
    # is dropped by the substitution. Kept as-is to preserve output.
    reg_italic = re.compile(r"[^\*]?\*([^\*]+)\*[^\*]?")
    reg_bold = re.compile(r"[^\*]?\*{2}([^\*]+)\*{2}[^\*]?")
    reg_link = re.compile(r"\[(.+)\]\((.+)\)")
    reg_p = re.compile(r"(^[^#]*$)")
    reg_code = re.compile(r"\`(.*)\`")
    reg_horizontal_rule = re.compile(r"^---$")

    content_title = ""
    html_content = []
    for content in splitted_content:
        # Handling newline
        content = content.replace("\n", "<br>")
        # Handling horizontal rule
        content = reg_horizontal_rule.sub("<hr>", content)
        # Handling italics and bold in italics
        content = reg_italic.sub(r"<i>\1</i>", reg_bold.sub(r"<b>\1</b>", content))
        # Handling bold and italics in bold
        content = reg_bold.sub(r"<b>\1</b>", reg_italic.sub(r"<i>\1</i>", content))
        # Handling code
        content = reg_code.sub(r"<code>\1</code>", content)
        # Handling Headers and paragraphs; h3 runs before h2 so that the
        # "## " inside "### " is not picked up by the h2 pattern first.
        content = reg_p.sub(r"<p>\1</p>", content)
        content = reg_h3.sub(
            rf"\1<h3 style='{header_style}'>\2</h3>\3",
            content,
        )
        content = reg_h2.sub(
            rf"\1<h2 style='{header_style}'>\2</h2>\3",
            content,
        )
        # Handling links
        content = reg_link.sub(r'<a href="\2">\1</a>', content)

        if reg_h1.match(content):
            # Drops only the first character (the '#' when the chunk
            # starts with it); the space after it stays in the title.
            content_title = content[1:]
            html_content.append(f"<h1 style='{header_style}'>{content_title}</h1>")
        else:
            html_content.append(content)

    processed_content = {
        "title": content_title,
        "content": html_content,
        "num_paragraphs": len(splitted_content),
    }

    # Processing Markdown Frontmatter. Guarded: a file without frontmatter
    # yields an empty list, and indexing it would raise IndexError.
    if frontmatter_content:
        frontmatter = frontmatter_content[0]
        for field in ("title", "description", "upload_date", "author"):
            found = re.search(rf"{field}:\s*(.*)", frontmatter)
            if found:
                processed_content[field] = found.group(1)

    return processed_content

def generate_html(self):
Expand Down
Empty file.
Empty file.
17 changes: 17 additions & 0 deletions tests/text_test.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import shutil
import pytest
from ssg import TextFile, determine_path, OUTPUT_DIR


Expand Down Expand Up @@ -37,3 +38,19 @@ def test_genetate_html(self):
assert os.path.isfile(
dir_path + "/test_file.html"
), "HTML file should exist in OUTPUT directory"

def test_process_file_invalid_file(self):
    """Processing a file with an unsupported extension (.epub) should raise."""
    paths = determine_path(
        {"input": "./tests/test_files/test_unsupported_type.epub"}
    )
    unsupported_file = TextFile(paths["file_path"], paths["dir_path"])
    # Should throw error
    with pytest.raises(Exception):
        unsupported_file.process_file()

def test_process_empty_file(self):
    """Processing the empty .txt fixture should raise."""
    paths = determine_path({"input": "./tests/test_files/test_empty_file.txt"})
    empty_file = TextFile(paths["file_path"], paths["dir_path"])
    # Should throw error
    with pytest.raises(Exception):
        empty_file.process_file()
2 changes: 2 additions & 0 deletions tests/utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ def test_determine_path_good_file(self):
"""
parsed_arg_obj = {"input": "./tests/test_files/test_file.md"}
path_obj = determine_path(parsed_arg_obj)
# By default all the attributes of path_obj is set to None
assert (
path_obj["file_path"] is not None
), "Path object should contain a path to the file"
Expand All @@ -38,6 +39,7 @@ def test_determine_path_dir_file(self):
"""
parsed_arg_obj = {"input": "./tests/test_files"}
path_obj = determine_path(parsed_arg_obj)
# By default all the attributes of path_obj is set to None
assert (
path_obj["file_names"] is not None
), "Path object should contain file_names that are in directory"
Expand Down

0 comments on commit 6af8131

Please sign in to comment.