-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpodcasts_summary.py
87 lines (76 loc) · 3.04 KB
/
podcasts_summary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
import json
import requests
import xmltodict
from airflow.decorators import dag, task
import pendulum
from airflow.providers.sqlite.operators.sqlite import SqliteOperator
from airflow.providers.sqlite.hooks.sqlite import SqliteHook
# RSS feed URL the pipeline pulls episodes from.
# NOTE(review): empty placeholder — the DAG cannot fetch anything until this
# is filled in (presumably redacted before publishing; confirm the real feed).
PODCAST_URL = ""
# Local directory where downloaded episode .mp3 files are stored.
# NOTE(review): hard-coded absolute Windows path — consider an Airflow
# Variable or environment variable for portability across machines.
EPISODE_FOLDER = "C:/Users/jaswanth/Airflow Workspace/airflow-docker/dags/episodes"
@dag(
    dag_id='podcast_summary',
    schedule_interval="@daily",
    start_date=pendulum.datetime(2022, 5, 30),
    catchup=False,
)
def podcast_summary():
    """Daily podcast pipeline.

    Fetches the RSS feed at ``PODCAST_URL``, records episodes not yet present
    in the SQLite ``episodes`` table, downloads each episode's audio file into
    ``EPISODE_FOLDER``, and verifies every file landed on disk.

    Task graph:
        create_table_sqlite >> get_episodes >> [load_episodes,
        download_episodes] >> verify_downloads
    """

    def _episode_filename(episode):
        # Derive the local filename from the last path segment of the episode
        # link; the link is the table's primary key, so this is assumed unique.
        return f"{episode['link'].split('/')[-1]}.mp3"

    create_database = SqliteOperator(
        task_id='create_table_sqlite',
        sql=r"""
        CREATE TABLE IF NOT EXISTS episodes (
            link TEXT PRIMARY KEY,
            title TEXT,
            filename TEXT,
            published TEXT,
            description TEXT,
            transcript TEXT
        );
        """,
        sqlite_conn_id="podcasts",
    )

    @task()
    def get_episodes():
        """Fetch the RSS feed and return its list of episode dicts."""
        # timeout keeps a hung connection from stalling the whole DAG run;
        # raise_for_status fails loudly instead of parsing an HTTP error page.
        data = requests.get(PODCAST_URL, timeout=30)
        data.raise_for_status()
        feed = xmltodict.parse(data.text)
        episodes = feed["rss"]["channel"]["item"]
        print(f"Found {len(episodes)} episodes.")
        return episodes

    @task()
    def load_episodes(episodes):
        """Insert episodes whose link is not already in the episodes table."""
        hook = SqliteHook(sqlite_conn_id="podcasts")
        stored_episodes = hook.get_pandas_df("SELECT * from episodes;")
        # Build the set once: O(1) membership test per episode instead of an
        # O(n) array scan on every iteration.
        known_links = set(stored_episodes["link"].values)
        new_episodes = []
        for episode in episodes:
            if episode["link"] not in known_links:
                new_episodes.append([
                    episode["link"],
                    episode["title"],
                    episode["pubDate"],
                    episode["description"],
                    _episode_filename(episode),
                ])
        hook.insert_rows(
            table='episodes',
            rows=new_episodes,
            target_fields=["link", "title", "published", "description", "filename"],
        )

    @task()
    def download_episodes(episodes):
        """Download each episode's audio file unless it already exists on disk."""
        # Create the target directory up front so a fresh deployment works.
        os.makedirs(EPISODE_FOLDER, exist_ok=True)
        for episode in episodes:
            filename = _episode_filename(episode)
            audio_path = os.path.join(EPISODE_FOLDER, filename)
            if not os.path.exists(audio_path):
                # Fixed: the message had lost its interpolation and printed a
                # literal placeholder instead of the file being fetched.
                print(f"Downloading {filename}")
                audio = requests.get(episode["enclosure"]["@url"], timeout=120)
                audio.raise_for_status()
                # "wb" — we only write; the "+" read mode was unnecessary.
                with open(audio_path, "wb") as f:
                    f.write(audio.content)

    @task()
    def verify_downloads(episodes):
        """Report every episode whose audio file is missing from disk."""
        missing = [
            _episode_filename(episode)
            for episode in episodes
            if not os.path.exists(
                os.path.join(EPISODE_FOLDER, _episode_filename(episode))
            )
        ]
        if missing:
            # Report all missing files, not just the first one found.
            for filename in missing:
                print(f"{filename} isn't downloaded")
            return
        print("All the podcasts were successfully downloaded")

    # Wire the graph: the table must exist before episodes are loaded, and
    # verification runs only after both loading and downloading finish.
    podcast_episodes = get_episodes()
    create_database.set_downstream(podcast_episodes)
    load = load_episodes(podcast_episodes)
    downloads = download_episodes(podcast_episodes)
    verify_downloads(podcast_episodes).set_upstream([downloads, load])
summary = podcast_summary()