Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support: PdfMinor support adding #45

Merged
merged 3 commits into from
Oct 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ sh setup.sh
pip install audiobook
```

## Usages

The audiobook is a python module for listening to your favourite PDF book.


```python
from audiobook import AudioBook
# argument: Speech-Speed="slow/normal/fast", volume = 0.0 to 1.0
Expand All @@ -48,16 +53,25 @@ ab.save_audio(self, input_book_path, password=None, save_page_wise=False):
- input_book_path: path to pdf file
- password: password to pdf file
- save_page_wise: if True, saves each page as a separate mp3 file
- extraction_engine: "pypdf2" or "pdfminer", the engine used for extracting text from the pdf file

ab.read_book(file_path) # listen to the book
ab.create_json_book(file_path) # create json file of the book

ab.get_library() # get all the books in your library
```

## Usages
## Supported File Formats

The audiobook is a python module for listening to your favourite PDF book.
| File Format | Supported | Engine |
| :--- | :---: | :---: |
| PDF | :white_check_mark: | pypdf2/pdfminer |
| TXT | :white_check_mark: | not required |
| EPUB | :white_check_mark: | not required |
| MOBI | :white_check_mark: | not required |
| HTML | :white_check_mark: | not required |
| DOCX | :white_check_mark: | not required |
| ODT | :x: | not required |

## Test

Expand Down Expand Up @@ -93,12 +107,17 @@ This project is currently in development. Any contributions are welcome.

## Changelog

**V2.0.2**
- [x] Docx files support added
- [x] Pdfminer as engine added

**V2.0.1**

- [x] Mobi file support
- [x] Epub file support
- [x] User can now save the audiobook for future
- [x] User library added
- [

**V2.0.0**

Expand Down
Empty file.
74 changes: 74 additions & 0 deletions audiobook/doc_parser/pdf_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import io

import PyPDF2

from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage


class PdfMinerDocParser(object):
    """
    PdfMiner Doc Parser: extracts text from PDF files using pdfminer.

    methods:
    1. get_metadata : placeholder, not implemented yet (returns None)
    2. get_text : convert pdf to text

    """

    def __init__(self):
        # Layout-analysis parameters handed to the text converter in get_text.
        self.laparams = LAParams(char_margin=1, line_margin=0.5, all_texts=True)

    def get_metadata(self):
        """Placeholder for metadata extraction; currently returns None."""
        pass

    def get_text(self, filepath, password=None, maxpages=0, caching=True):
        """Read the text of a pdf file and return it as one string.

        filepath: path to the pdf file
        password: password of the pdf file; None is treated as "no password"
        maxpages: forwarded to PDFPage.get_pages (0 is the library default)
        caching: forwarded to PDFPage.get_pages
        """
        rsrcmgr = PDFResourceManager()
        retstr = io.StringIO()
        device = TextConverter(rsrcmgr, retstr, laparams=self.laparams)

        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            with open(filepath, 'rb') as fp:
                pages = PDFPage.get_pages(
                    fp=fp,
                    pagenos=set(),  # empty set: no page-number filter
                    maxpages=maxpages,
                    # pdfminer expects a string password; None would break
                    # on encrypted documents, so fall back to "".
                    password=password or "",
                    caching=caching,
                )
                for page in pages:
                    interpreter.process_page(page)

            # Read the buffer before it is closed in the finally clause.
            return retstr.getvalue()
        finally:
            device.close()
            retstr.close()


class PyPDF2DocParser(object):
    """
    PyPdf2 Doc Parser: extracts text from PDF files using PyPDF2.

    methods:
    1. get_metadata : placeholder, not implemented yet (returns None)
    2. get_text : convert pdf to text

    """

    def __init__(self):
        pass

    def get_metadata(self):
        """Placeholder for metadata extraction; currently returns None."""
        pass

    def get_text(self, filepath, password=None, maxpages=0):
        """Read the text of a pdf file and return it as one string.

        filepath: path to the pdf file
        password: password of the pdf file; None is treated as ""
        maxpages: read at most this many pages; 0 means all pages

        Page texts are joined with a newline so that the last word of one
        page is not glued to the first word of the next.
        """
        with open(filepath, "rb") as fp:
            pdfReader = PyPDF2.PdfFileReader(fp)
            # Decrypt based on the file's state, not on whether a password
            # was supplied: an encrypted file with an empty password still
            # needs decrypting, and an unencrypted file must not be decrypted.
            if pdfReader.isEncrypted:
                pdfReader.decrypt(password or "")
            num_pages = pdfReader.numPages
            if maxpages:
                num_pages = min(num_pages, maxpages)
            page_texts = [
                pdfReader.getPage(i).extractText() for i in range(num_pages)
            ]
        return "\n".join(page_texts)
File renamed without changes.
14 changes: 8 additions & 6 deletions audiobook/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def get_library(self):
)
return total_books

def create_json_book(self, input_book_path, password=None):
def create_json_book(self, input_book_path, password=None, extraction_engine=None):
"""method to create json book from input file
it calls respective method based on file format"""
json_filename = (
Expand All @@ -77,7 +77,7 @@ def create_json_book(self, input_book_path, password=None):
return json_book, metadata

elif input_book_path.endswith(".pdf"):
json_book, metadata = pdf_to_json(input_book_path, password)
json_book, metadata = pdf_to_json(input_book_path, password, extraction_engine=extraction_engine)
elif input_book_path.endswith(".txt"):
json_book, metadata = txt_to_json(input_book_path)
elif input_book_path.endswith(".epub"):
Expand All @@ -93,9 +93,10 @@ def create_json_book(self, input_book_path, password=None):

return json_book, metadata

def save_audio(self, input_book_path, password=None, save_page_wise=False):
def save_audio(self, input_book_path, password=None, save_page_wise=False, extraction_engine=None):
"""method to save audio files in folder"""
json_book, metadata = self.create_json_book(input_book_path, password)

json_book, metadata = self.create_json_book(input_book_path, password, extraction_engine)

book_name = metadata["book_name"]
os.makedirs(book_name, exist_ok=True)
Expand All @@ -120,12 +121,12 @@ def save_audio(self, input_book_path, password=None, save_page_wise=False):
)
self.engine.runAndWait()

def read_book(self, input_book_path, password=None):
def read_book(self, input_book_path, password=None, extraction_engine=None):
"""method to read the book

input_book_path: filepath, url path or book name
"""
json_book, metadata = self.create_json_book(input_book_path, password)
json_book, metadata = self.create_json_book(input_book_path, password, extraction_engine)

pages = metadata["pages"]

Expand Down Expand Up @@ -154,6 +155,7 @@ def read_book(self, input_book_path, password=None):
"3. Type 'n' to read next page\n "
"4. Type 'q' to quit:\n "
"5. Type page number to read that page:\n"

user_input = input(input_message)
if user_input == "r":
speak_text(self.engine, f"Reading page {str(start_page+1)}")
Expand Down
48 changes: 22 additions & 26 deletions audiobook/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,12 @@
import ebooklib
import html2text
import mobi
import PyPDF2
from bs4 import BeautifulSoup
from ebooklib import epub

from audiobook.article_web_scraper import ArticleWebScraper

regex = re.compile(r"[\n\r\t]")
from audiobook.doc_parser.web_parser import ArticleWebScraper
from audiobook.doc_parser.pdf_parser import PyPDF2DocParser
from audiobook.doc_parser.pdf_parser import PdfMinerDocParser


def load_json(filename):
Expand All @@ -27,6 +26,7 @@ def write_json_file(json_data, filename):

def text_preprocessing(input_text):
    """Remove newlines, carriage returns and tabs, then squeeze repeated spaces."""
    without_breaks = re.sub(r"[\n\r\t]", "", input_text)
    return re.sub(" +", " ", without_breaks)
Expand Down Expand Up @@ -73,32 +73,28 @@ def mobi_to_json(input_book_path):
return json_book, metadata


def pdf_to_json(input_book_path, password=None, extraction_engine="pypdf2"):
    """sub method to create json book from pdf file

    input_book_path: path to the pdf file
    password: password of the pdf file, forwarded to the parser
    extraction_engine: "pypdf2" or "pdfminer"; None selects "pypdf2"

    Returns (json_book, metadata). json_book maps a page index (as a string)
    to a chunk of at most 2000 characters of the preprocessed text.
    metadata is a dict with "book_name" and "pages".

    Raises NotImplementedError for any other extraction_engine value.
    """
    # AudioBook.create_json_book forwards extraction_engine=None when the
    # user did not pick an engine, so None has to mean "use the default".
    engine = "pypdf2" if extraction_engine is None else extraction_engine

    if engine == "pdfminer":
        print("Using pdfminer")
        pdf_parser = PdfMinerDocParser()
    elif engine == "pypdf2":
        print("Using pypdf2")
        pdf_parser = PyPDF2DocParser()
    else:
        raise NotImplementedError("Only pdfminer and pypdf2 are supported")

    text = pdf_parser.get_text(input_book_path, password=password)
    text = text_preprocessing(text)

    # The parsers return one flat string, so "pages" are fixed-size chunks.
    page_size = 2000
    json_book = {
        str(start // page_size): text[start: start + page_size]
        for start in range(0, len(text), page_size)
    }

    # metadata must stay a dict: callers index it by "book_name" and "pages".
    metadata = {
        "book_name": os.path.basename(input_book_path).split(".")[0],
        "pages": len(json_book),
    }
    return json_book, metadata


Expand Down