From e30a03f9a246093a6dfe6ed86c70315f4f5c85b4 Mon Sep 17 00:00:00 2001
From: agatha
Date: Sun, 1 Oct 2023 14:56:18 -0400
Subject: [PATCH] Initial commit

---
 .gitignore       |  4 +++
 README.md        |  2 ++
 main.py          | 86 ++++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |  2 ++
 4 files changed, 94 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 main.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c2162ed
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.idea/
+venv/
+
+urls.txt
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d2d4326
--- /dev/null
+++ b/README.md
@@ -0,0 +1,2 @@
+# Podcast Generator URL Scraper
+Shitty code to scrape all media URLs from a site generated with Podcast Generator.
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..c9a80e5
--- /dev/null
+++ b/main.py
@@ -0,0 +1,86 @@
+from urllib.parse import urljoin
+
+import requests
+from bs4 import BeautifulSoup
+
+# Seconds to wait for the server before giving up on a request.
+REQUEST_TIMEOUT = 30
+
+
+def get_total_pages(url):
+    """Return the number of listing pages advertised at ``url``.
+
+    The count is the number of ``li.page-item`` entries inside the first
+    ``ul.pagination`` element. A page without a pagination element is
+    treated as a single-page site.
+    """
+    html = fetch_page_html(url)
+    soup = BeautifulSoup(html, 'html.parser')
+
+    pagination_element = soup.find('ul', class_='pagination')
+    if pagination_element is None:
+        # No pager on the page: only the page just fetched exists.
+        return 1
+
+    page_items = pagination_element.find_all('li', class_='page-item')
+    return max(len(page_items), 1)
+
+
+def fetch_page_html(url):
+    """Fetch ``url`` and return the raw response body as bytes.
+
+    Raises:
+        RuntimeError: if the server answers with a status other than 200.
+        requests.RequestException: on connection errors or timeouts.
+    """
+    response = requests.get(url, timeout=REQUEST_TIMEOUT)
+    if response.status_code != 200:
+        raise RuntimeError(
+            f'Failed to fetch HTML from {url} '
+            f'(HTTP {response.status_code})'
+        )
+    return response.content
+
+
+def extract_media_urls(html):
+    """Return the ``href`` of every ``<a download>`` link in ``html``.
+
+    Anchors that carry ``download`` but no ``href`` are skipped.
+    """
+    soup = BeautifulSoup(html, 'html.parser')
+    download_links = soup.find_all(
+        'a', attrs={'download': True, 'href': True}
+    )
+    return [link['href'] for link in download_links]
+
+
+def get_all_media_urls(base_url):
+    """Collect the absolute media URLs from every page of a site.
+
+    ``base_url`` is the site root; a trailing slash is tolerated.
+    """
+    base_url = base_url.rstrip('/')
+    total_pages = get_total_pages(base_url)
+
+    media_urls = []
+    for page_number in range(1, total_pages + 1):
+        page_url = f'{base_url}/index.php?page={page_number}'
+        html = fetch_page_html(page_url)
+        # Resolve against the page URL so relative, root-relative and
+        # already-absolute hrefs all become valid absolute URLs.
+        media_urls.extend(
+            urljoin(page_url, link) for link in extract_media_urls(html)
+        )
+
+    return media_urls
+
+
+def main(base_url='https://apcs.rhonk.cloud', output_path='urls.txt'):
+    """Write every media URL found at ``base_url`` to ``output_path``."""
+    media_urls = get_all_media_urls(base_url)
+    with open(output_path, 'w', encoding='utf-8') as f:
+        f.write('\n'.join(media_urls))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..a98ae43
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+requests
+beautifulsoup4
\ No newline at end of file