Skip to content

Commit

Permalink
Merge pull request #46 from ivansaul/refactor
Browse files Browse the repository at this point in the history
feat: refine selectors to handle dynamic HTML structures
  • Loading branch information
ivansaul authored Nov 23, 2024
2 parents 19093b0 + 91d8d21 commit 82dd297
Show file tree
Hide file tree
Showing 7 changed files with 41 additions and 8 deletions.
4 changes: 2 additions & 2 deletions src/facilito/async_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,9 @@ async def download(self, url: str, **kwargs):

from .downloaders import download_course, download_unit
from .models import TypeUnit
from .utils import is_course, is_lecture, is_video
from .utils import is_course, is_lecture, is_quiz, is_video

if is_video(url) or is_lecture(url):
if is_video(url) or is_lecture(url) or is_quiz(url):
unit = await self.fetch_unit(url)
extension = ".mp4" if unit.type == TypeUnit.VIDEO else ".mhtml"
await download_unit(
Expand Down
6 changes: 3 additions & 3 deletions src/facilito/collectors/course.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

async def _fetch_course_chapters(page: Page) -> list[Chapter]:
CHAPTERS_SELECTOR = (
".collapsible.no-box-shadow.no-border.f-topics.no-time div.f-top-16"
".collapsible.no-box-shadow.no-border.f-topics.no-time > .f-top-16"
)

try:
Expand All @@ -32,7 +32,7 @@ async def _fetch_course_chapters(page: Page) -> list[Chapter]:
for i in range(chapters_count):
CHAPTER_NAME_SELECTOR = "header h4"
UNITS_SELECTOR = ".collapsible-body ul a"
UNIT_NAME_SELECTOR = ".box p.ibm"
UNIT_NAME_SELECTOR = "p.ibm"

chapter_name = (
await chapters_selectors.nth(i)
Expand Down Expand Up @@ -86,7 +86,7 @@ async def _fetch_course_chapters(page: Page) -> list[Chapter]:


async def fetch_course(context: BrowserContext, url: str) -> Course:
NAME_SELECTOR = ".cover-with-image h1"
NAME_SELECTOR = ".f-course-presentation h1, .cover-with-image h1"

try:
page = await context.new_page()
Expand Down
16 changes: 15 additions & 1 deletion src/facilito/collectors/unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,27 @@

from ..errors import UnitError
from ..helpers import slugify
from ..models import Unit
from ..models import TypeUnit, Unit
from ..utils import get_unit_type


async def fetch_unit(context: BrowserContext, url: str):
NAME_SELECTOR = ".title-section header h1"

try:
type = get_unit_type(url)

if type == TypeUnit.QUIZ:
# TODO: implement quiz fetching
return Unit(
type=type,
url=url,
name="quiz",
slug="quiz",
)
except Exception:
raise UnitError()

try:
page = await context.new_page()
await page.goto(url)
Expand Down
2 changes: 1 addition & 1 deletion src/facilito/downloaders/course.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ async def download_course(context: BrowserContext, course: Course, **kwargs):
**kwargs,
)

if unit.type == TypeUnit.LECTURE:
else:
await download_unit(
context,
unit,
Expand Down
2 changes: 1 addition & 1 deletion src/facilito/downloaders/unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,5 @@ async def download_unit(context: BrowserContext, unit: Unit, path: Path, **kwarg
**kwargs,
) # type: ignore

if unit.type == TypeUnit.LECTURE:
else:
await save_page(context, unit.url, path)
1 change: 1 addition & 0 deletions src/facilito/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ class Quality(str, Enum):
class TypeUnit(str, Enum):
    """Closed set of unit kinds.

    The ``str`` mix-in makes every member compare equal to its plain string
    value (for example ``TypeUnit.QUIZ == "quiz"``).
    """

    LECTURE = "lecture"
    VIDEO = "video"
    QUIZ = "quiz"


class Resource(BaseModel):
Expand Down
18 changes: 18 additions & 0 deletions src/facilito/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,21 @@ def is_course(url: str) -> bool:
return "/cursos/" in url


def is_quiz(url: str) -> bool:
    """
    Check if a URL is a quiz.

    The check is a plain substring test: the URL is treated as a quiz when it
    contains the path segment ``/quizzes/``. No further validation of the URL
    is performed.

    :param str url: URL to check.
    :return bool: True if the URL is a quiz, False otherwise.

    Example
    -------
    >>> is_quiz("https://example.com/quizzes/python-basics")
    True
    >>> is_quiz("https://example.com/videos/python-basics")
    False
    """
    return "/quizzes/" in url


def get_unit_type(url: str) -> TypeUnit:
"""
Get the type of a unit from its URL.
Expand All @@ -163,4 +178,7 @@ def get_unit_type(url: str) -> TypeUnit:
if is_lecture(url):
return TypeUnit.LECTURE

if is_quiz(url):
return TypeUnit.QUIZ

raise UnitError()

0 comments on commit 82dd297

Please sign in to comment.