Initial commit

agatha 2023-10-01 14:56:18 -04:00
commit e30a03f9a2
4 changed files with 65 additions and 0 deletions

.gitignore vendored Normal file (+4)

@@ -0,0 +1,4 @@
.idea/
venv/
urls.txt

README.md Normal file (+2)

@@ -0,0 +1,2 @@
# Podcast Generator URL Scraper
Quick-and-dirty code to scrape all media URLs from a site generated with Podcast Generator.

main.py Normal file (+57)

@@ -0,0 +1,57 @@
import requests
from bs4 import BeautifulSoup


def get_total_pages(url):
    """Get the total page count."""
    html = fetch_page_html(url)
    soup = BeautifulSoup(html, 'html.parser')
    pagination_element = soup.find('ul', class_='pagination')
    page_items = pagination_element.find_all('li', class_='page-item')
    return len(page_items)


def fetch_page_html(url):
    """Fetch a page's HTML."""
    response = requests.get(url)
    if response.status_code == 200:
        return response.content
    else:
        raise Exception(f'Failed to fetch HTML from {url}')


def extract_media_urls(html):
    """Extracts all media URLs from page HTML."""
    soup = BeautifulSoup(html, 'html.parser')
    download_links = soup.find_all('a', attrs={'download': True})
    media_urls = [link['href'] for link in download_links]
    return media_urls


def get_all_media_urls(base_url):
    """Extracts all media URLs from a Podcast Generator site."""
    total_pages = get_total_pages(base_url)
    media_urls = []
    for page_number in range(1, total_pages + 1):
        page_url = f'{base_url}/index.php?page={page_number}'
        html = fetch_page_html(page_url)
        media_links = extract_media_urls(html)
        for link in media_links:
            media_urls.append(f'{base_url}/{link}')
    return media_urls


def main():
    base_url = 'https://apcs.rhonk.cloud'
    media_urls = get_all_media_urls(base_url)
    with open('urls.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(media_urls))


if __name__ == '__main__':
    main()

requirements.txt Normal file (+2)

@@ -0,0 +1,2 @@
requests
beautifulsoup4
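
A minimal usage sketch, assuming main.py sits on the import path: besides running the script directly, the helper get_all_media_urls can be called from other code. The base URL below is the one hard-coded in main(); any other Podcast Generator site should work the same way.

    # Sketch: drive the scraper from another script, assuming main.py is importable.
    from main import get_all_media_urls

    urls = get_all_media_urls('https://apcs.rhonk.cloud')  # base URL from main()
    print(f'Found {len(urls)} media URLs')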