From e30a03f9a246093a6dfe6ed86c70315f4f5c85b4 Mon Sep 17 00:00:00 2001
From: agatha <agatha@juggalol.com>
Date: Sun, 1 Oct 2023 14:56:18 -0400
Subject: [PATCH] Initial commit

---
 .gitignore       |  4 ++++
 README.md        |  2 ++
 main.py          | 57 ++++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |  2 ++
 4 files changed, 65 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 main.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c2162ed
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.idea/
+venv/
+
+urls.txt
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d2d4326
--- /dev/null
+++ b/README.md
@@ -0,0 +1,2 @@
+# Podcast Generator URL Scraper
+Quick-and-dirty script that scrapes all media URLs from a site generated with Podcast Generator and writes them to `urls.txt`.
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..c9a80e5
--- /dev/null
+++ b/main.py
@@ -0,0 +1,57 @@
+import requests
+from bs4 import BeautifulSoup
+
+
+def get_total_pages(url):
+    """Get the total page count (1 when the page has no pagination list)."""
+    soup = BeautifulSoup(fetch_page_html(url), 'html.parser')
+    pagination_element = soup.find('ul', class_='pagination')
+    if pagination_element is None:
+        # No pagination list on the page: treat the site as a single page.
+        return 1
+    # Every <li class="page-item"> inside the list is counted as one page.
+    return len(pagination_element.find_all('li', class_='page-item'))
+
+
+def fetch_page_html(url):
+    """Fetch a page's HTML as bytes; raise Exception on any non-200 status."""
+    # A timeout keeps a stalled server from hanging the scrape indefinitely.
+    response = requests.get(url, timeout=30)
+    if response.status_code != 200:
+        raise Exception(f'Failed to fetch HTML from {url}')
+    return response.content
+
+
+def extract_media_urls(html):
+    """Return the href of every <a download> link found in the page HTML."""
+    soup = BeautifulSoup(html, 'html.parser')
+    # Requiring href too skips download anchors without one (no KeyError).
+    links = soup.find_all('a', attrs={'download': True, 'href': True})
+    return [link['href'] for link in links]
+
+
+def get_all_media_urls(base_url):
+    """Return absolute media URLs from every page of the site at base_url."""
+    from urllib.parse import urljoin  # local import keeps the block standalone
+
+    # Normalise so a trailing slash on base_url never yields '//' in URLs.
+    root = base_url.rstrip('/')
+    media_urls = []
+    for page_number in range(1, get_total_pages(base_url) + 1):
+        html = fetch_page_html(f'{root}/index.php?page={page_number}')
+        # urljoin leaves absolute hrefs intact and resolves relative and
+        # root-relative ones against the site root.
+        media_urls.extend(
+            urljoin(f'{root}/', link) for link in extract_media_urls(html))
+    return media_urls
+
+
+def main():
+    """Scrape the hard-coded site and save its media URLs to urls.txt."""
+    urls = get_all_media_urls('https://apcs.rhonk.cloud')
+    with open('urls.txt', 'w', encoding='utf-8') as out:
+        out.write('\n'.join(urls))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..a98ae43
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+requests
+beautifulsoup4
\ No newline at end of file